From 32541c7e8c695b8f5552db285ef00d0404f96f0b Mon Sep 17 00:00:00 2001 From: dgouju Date: Tue, 3 Dec 2024 13:35:25 +0100 Subject: [PATCH 001/140] ParallelStore: Stripping configuration --- modules/file-system/parallelstore/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md index 46f0969b93..d2a0f06b0f 100644 --- a/modules/file-system/parallelstore/README.md +++ b/modules/file-system/parallelstore/README.md @@ -154,6 +154,8 @@ No modules. | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [size\_gb](#input\_size\_gb) | Storage size of the parallelstore instance in GB. | `number` | `12000` | no | | [zone](#input\_zone) | Location for parallelstore instance. | `string` | n/a | yes | +| [file\_stripe](#input\_file\_stripe) | File-level stripping setting, must be `"FILE_STRIPE_LEVEL_UNSPECIFIED"`, `"FILE_STRIPE_LEVEL_MIN"`, `"FILE_STRIPE_LEVEL_BALANCED"` or `"FILE_STRIPE_LEVEL_MAX"`. More details in the [documentation](https://cloud.google.com/parallelstore/docs/performance#file_striping_setting). | `string` | `"FILE_STRIPE_LEVEL_UNSPECIFIED"` | no | +| [directory\_stripe](#input\_directory\_stripe) | Directory-level stripping setting, must be `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"`, `"DIRECTORY_STRIPE_LEVEL_MIN"`, `"DIRECTORY_STRIPE_LEVEL_BALANCED"` or `"DIRECTORY_STRIPE_LEVEL_MAX"`. More details in the [documentation](https://cloud.google.com/parallelstore/docs/performance#directory_striping_setting). | `string` | `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"` | no | ## Outputs From 6a1e455089fe89db358106f40e3d09e7d1287e96 Mon Sep 17 00:00:00 2001 From: dgouju Date: Tue, 3 Dec 2024 13:36:01 +0100 Subject: [PATCH 002/140] ParallelStore: Stripping configuration --- modules/file-system/parallelstore/main.tf | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/file-system/parallelstore/main.tf b/modules/file-system/parallelstore/main.tf index 3de3b94f3a..bb8eeb9606 100644 --- a/modules/file-system/parallelstore/main.tf +++ b/modules/file-system/parallelstore/main.tf @@ -46,11 +46,13 @@ resource "random_id" "resource_name_suffix" { } resource "google_parallelstore_instance" "instance" { - project = var.project_id - instance_id = local.id - location = var.zone - capacity_gib = var.size_gb - network = var.network_id + project = var.project_id + instance_id = local.id + location = var.zone + capacity_gib = var.size_gb + network = var.network_id + file_stripe_level = var.file_stripe + directory_stripe_level = var.directory_stripe labels = local.labels From 9a18ca9e33901c53a2eb07ae6d848c805f457287 Mon Sep 17 00:00:00 2001 From: dgouju Date: Tue, 3 Dec 2024 13:36:34 +0100 Subject: [PATCH 003/140] ParallelStore: Stripping configuration --- .../file-system/parallelstore/variables.tf | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/modules/file-system/parallelstore/variables.tf b/modules/file-system/parallelstore/variables.tf index 8dcac7c528..fc86008d9b 100644 --- a/modules/file-system/parallelstore/variables.tf +++ b/modules/file-system/parallelstore/variables.tf @@ -91,3 +91,33 @@ variable "import_destination_path" { type = string default = null } + +variable "file_stripe" { + description = "The parallelstore stripe level for files." 
+ type = string + default = "FILE_STRIPE_LEVEL_UNSPECIFIED" + validation { + condition = contains([ + "FILE_STRIPE_LEVEL_UNSPECIFIED", + "FILE_STRIPE_LEVEL_MIN", + "FILE_STRIPE_LEVEL_BALANCED", + "FILE_STRIPE_LEVEL_MAX", + ], var.file_stripe) + error_message = "var.file_stripe must be set to \"FILE_STRIPE_LEVEL_UNSPECIFIED\", \"FILE_STRIPE_LEVEL_MIN\", \"FILE_STRIPE_LEVEL_BALANCED\", or \"FILE_STRIPE_LEVEL_MAX\"" + } +} + +variable "directory_stripe" { + description = "The parallelstore stripe level for directories." + type = string + default = "DIRECTORY_STRIPE_LEVEL_UNSPECIFIED" + validation { + condition = contains([ + "DIRECTORY_STRIPE_LEVEL_UNSPECIFIED", + "DIRECTORY_STRIPE_LEVEL_MIN", + "DIRECTORY_STRIPE_LEVEL_BALANCED", + "DIRECTORY_STRIPE_LEVEL_MAX", + ], var.directory_stripe) + error_message = "var.directory_stripe must be set to \"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED\", \"DIRECTORY_STRIPE_LEVEL_MIN\", \"DIRECTORY_STRIPE_LEVEL_BALANCED\", or \"DIRECTORY_STRIPE_LEVEL_MAX\"" + } +} From 7a639d2f013159634eeb105cc1e1fec4ba094b69 Mon Sep 17 00:00:00 2001 From: dgouju Date: Tue, 3 Dec 2024 13:44:16 +0100 Subject: [PATCH 004/140] Keep alphabetical order --- modules/file-system/parallelstore/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md index d2a0f06b0f..a977a0e587 100644 --- a/modules/file-system/parallelstore/README.md +++ b/modules/file-system/parallelstore/README.md @@ -143,6 +143,8 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. | `string` | n/a | yes | +| [directory\_stripe](#input\_directory\_stripe) | Directory-level stripping setting, must be `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"`, `"DIRECTORY_STRIPE_LEVEL_MIN"`, `"DIRECTORY_STRIPE_LEVEL_BALANCED"` or `"DIRECTORY_STRIPE_LEVEL_MAX"`. More details in the [documentation](https://cloud.google.com/parallelstore/docs/performance#directory_striping_setting). | `string` | `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"` | no | +| [file\_stripe](#input\_file\_stripe) | File-level stripping setting, must be `"FILE_STRIPE_LEVEL_UNSPECIFIED"`, `"FILE_STRIPE_LEVEL_MIN"`, `"FILE_STRIPE_LEVEL_BALANCED"` or `"FILE_STRIPE_LEVEL_MAX"`. More details in the [documentation](https://cloud.google.com/parallelstore/docs/performance#file_striping_setting). | `string` | `"FILE_STRIPE_LEVEL_UNSPECIFIED"` | no | | [import\_destination\_path](#input\_import\_destination\_path) | The name of local path to import data on parallelstore instance from GCS bucket. | `string` | `null` | no | | [import\_gcs\_bucket\_uri](#input\_import\_gcs\_bucket\_uri) | The name of the GCS bucket to import data from to parallelstore. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to parallel store instance. | `map(string)` | `{}` | no | @@ -154,8 +156,6 @@ No modules. | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [size\_gb](#input\_size\_gb) | Storage size of the parallelstore instance in GB. | `number` | `12000` | no | | [zone](#input\_zone) | Location for parallelstore instance. | `string` | n/a | yes | -| [file\_stripe](#input\_file\_stripe) | File-level stripping setting, must be `"FILE_STRIPE_LEVEL_UNSPECIFIED"`, `"FILE_STRIPE_LEVEL_MIN"`, `"FILE_STRIPE_LEVEL_BALANCED"` or `"FILE_STRIPE_LEVEL_MAX"`. 
More details in the [documentation](https://cloud.google.com/parallelstore/docs/performance#file_striping_setting). | `string` | `"FILE_STRIPE_LEVEL_UNSPECIFIED"` | no | -| [directory\_stripe](#input\_directory\_stripe) | Directory-level stripping setting, must be `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"`, `"DIRECTORY_STRIPE_LEVEL_MIN"`, `"DIRECTORY_STRIPE_LEVEL_BALANCED"` or `"DIRECTORY_STRIPE_LEVEL_MAX"`. More details in the [documentation](https://cloud.google.com/parallelstore/docs/performance#directory_striping_setting). | `string` | `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"` | no | ## Outputs From a66e570ea5b93beb817fdf5bfe974fd051e3c31c Mon Sep 17 00:00:00 2001 From: dgouju Date: Tue, 3 Dec 2024 13:47:26 +0100 Subject: [PATCH 005/140] google-beta 5.42.0 min required for stripping settings --- modules/file-system/parallelstore/versions.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/file-system/parallelstore/versions.tf b/modules/file-system/parallelstore/versions.tf index 24069a479c..55662d1526 100644 --- a/modules/file-system/parallelstore/versions.tf +++ b/modules/file-system/parallelstore/versions.tf @@ -20,7 +20,7 @@ terraform { required_providers { google-beta = { source = "hashicorp/google-beta" - version = ">= 5.25.0" + version = ">= 5.42.0" } random = { From 042203cbdfbccbbba20fb20bc8cd38763cf3088e Mon Sep 17 00:00:00 2001 From: dgouju Date: Tue, 3 Dec 2024 13:52:03 +0100 Subject: [PATCH 006/140] tflint --- modules/file-system/parallelstore/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/file-system/parallelstore/main.tf b/modules/file-system/parallelstore/main.tf index bb8eeb9606..a0d6c6a60b 100644 --- a/modules/file-system/parallelstore/main.tf +++ b/modules/file-system/parallelstore/main.tf @@ -54,10 +54,10 @@ resource "google_parallelstore_instance" "instance" { file_stripe_level = var.file_stripe directory_stripe_level = var.directory_stripe - labels = local.labels + labels = local.labels - provider = google-beta - depends_on = [var.private_vpc_connection_peering] + provider = google-beta + depends_on = [var.private_vpc_connection_peering] } resource "null_resource" "hydration" { From 8c1eab2b29c5055650aaa353fe3684d023508fc0 Mon Sep 17 00:00:00 2001 From: dgouju Date: Fri, 6 Dec 2024 21:15:52 +0100 Subject: [PATCH 007/140] Fixing README.md changes --- modules/file-system/parallelstore/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md index a977a0e587..46073d2643 100644 --- a/modules/file-system/parallelstore/README.md +++ b/modules/file-system/parallelstore/README.md @@ -114,7 +114,7 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 0.13 | -| [google-beta](#requirement\_google-beta) | >= 5.25.0 | +| [google-beta](#requirement\_google-beta) | >= 5.42.0 | | [null](#requirement\_null) | ~> 3.0 | | [random](#requirement\_random) | ~> 3.0 | @@ -122,7 +122,7 @@ limitations under the License. | Name | Version | |------|---------| -| [google-beta](#provider\_google-beta) | >= 5.25.0 | +| [google-beta](#provider\_google-beta) | >= 5.42.0 | | [null](#provider\_null) | ~> 3.0 | | [random](#provider\_random) | ~> 3.0 | @@ -143,8 +143,8 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. 
| `string` | n/a | yes | -| [directory\_stripe](#input\_directory\_stripe) | Directory-level stripping setting, must be `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"`, `"DIRECTORY_STRIPE_LEVEL_MIN"`, `"DIRECTORY_STRIPE_LEVEL_BALANCED"` or `"DIRECTORY_STRIPE_LEVEL_MAX"`. More details in the [documentation](https://cloud.google.com/parallelstore/docs/performance#directory_striping_setting). | `string` | `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"` | no | -| [file\_stripe](#input\_file\_stripe) | File-level stripping setting, must be `"FILE_STRIPE_LEVEL_UNSPECIFIED"`, `"FILE_STRIPE_LEVEL_MIN"`, `"FILE_STRIPE_LEVEL_BALANCED"` or `"FILE_STRIPE_LEVEL_MAX"`. More details in the [documentation](https://cloud.google.com/parallelstore/docs/performance#file_striping_setting). | `string` | `"FILE_STRIPE_LEVEL_UNSPECIFIED"` | no | +| [directory\_stripe](#input\_directory\_stripe) | The parallelstore stripe level for directories. | `string` | `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"` | no | +| [file\_stripe](#input\_file\_stripe) | The parallelstore stripe level for files. | `string` | `"FILE_STRIPE_LEVEL_UNSPECIFIED"` | no | | [import\_destination\_path](#input\_import\_destination\_path) | The name of local path to import data on parallelstore instance from GCS bucket. | `string` | `null` | no | | [import\_gcs\_bucket\_uri](#input\_import\_gcs\_bucket\_uri) | The name of the GCS bucket to import data from to parallelstore. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to parallel store instance. | `map(string)` | `{}` | no | From 49fc50840cdc1641d754db45977bf3580dbcd2f1 Mon Sep 17 00:00:00 2001 From: dgouju Date: Fri, 6 Dec 2024 21:16:36 +0100 Subject: [PATCH 008/140] Fixing indent --- modules/file-system/parallelstore/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/file-system/parallelstore/main.tf b/modules/file-system/parallelstore/main.tf index a0d6c6a60b..bb8eeb9606 100644 --- a/modules/file-system/parallelstore/main.tf +++ b/modules/file-system/parallelstore/main.tf @@ -54,10 +54,10 @@ resource "google_parallelstore_instance" "instance" { file_stripe_level = var.file_stripe directory_stripe_level = var.directory_stripe - labels = local.labels + labels = local.labels - provider = google-beta - depends_on = [var.private_vpc_connection_peering] + provider = google-beta + depends_on = [var.private_vpc_connection_peering] } resource "null_resource" "hydration" { From 0ea8282b577af5a9c8e7f7ae0583a925f603d604 Mon Sep 17 00:00:00 2001 From: dgouju Date: Fri, 6 Dec 2024 21:17:10 +0100 Subject: [PATCH 009/140] Removing space --- modules/file-system/parallelstore/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/file-system/parallelstore/variables.tf b/modules/file-system/parallelstore/variables.tf index fc86008d9b..d2a61d6392 100644 --- a/modules/file-system/parallelstore/variables.tf +++ b/modules/file-system/parallelstore/variables.tf @@ -91,7 +91,7 @@ variable "import_destination_path" { type = string default = null } - + variable "file_stripe" { description = "The parallelstore stripe level for files." 
type = string From cef26f3a17385454958dbd2ecb0b80f4303b8692 Mon Sep 17 00:00:00 2001 From: Swarna Bharathi Mantena Date: Tue, 10 Dec 2024 12:04:26 +0000 Subject: [PATCH 010/140] a test PR --- modules/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/README.md b/modules/README.md index c5f1df282a..93f9dc73b3 100644 --- a/modules/README.md +++ b/modules/README.md @@ -586,5 +586,5 @@ than having to set it manually. ## Writing Custom Cluster Toolkit Modules -Modules are flexible by design, however we do define some [best practices](../docs/module-guidelines.md) when +Modules are flexible by design, however we define some [best practices](../docs/module-guidelines.md) when creating a new module meant to be used with the Cluster Toolkit. From c60838ca206f52608efee61a93aaefff9f4c8aea Mon Sep 17 00:00:00 2001 From: annuay Date: Wed, 11 Dec 2024 20:45:53 +0000 Subject: [PATCH 011/140] expose deletion protection --- modules/scheduler/gke-cluster/README.md | 1 + modules/scheduler/gke-cluster/main.tf | 3 +-- modules/scheduler/gke-cluster/variables.tf | 10 ++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 74f1ac0ba3..675039add6 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -145,6 +145,7 @@ limitations under the License. | [cluster\_reference\_type](#input\_cluster\_reference\_type) | How the google\_container\_node\_pool.system\_node\_pools refers to the cluster. Possible values are: {SELF\_LINK, NAME} | `string` | `"SELF_LINK"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | | [default\_max\_pods\_per\_node](#input\_default\_max\_pods\_per\_node) | The default maximum number of pods per node in this cluster. | `number` | `null` | no | +| [deletion\_protection](#input\_deletion\_protection) | "Determines if the cluster can be deleted by gcluster commands or not".
To delete a cluster provisioned with deletion\_protection set to true, you must first set it to false and apply the changes.
Then proceed with deletion as usual. | `bool` | `false` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. Used in the GKE cluster name by default and can be configured with `prefix_with_deployment_name`. | `string` | n/a | yes | | [enable\_dataplane\_v2](#input\_enable\_dataplane\_v2) | Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters. If null, will default to false unless using multi-networking, in which case it will default to true | `bool` | `null` | no | | [enable\_dcgm\_monitoring](#input\_enable\_dcgm\_monitoring) | Enable GKE to collect DCGM metrics | `bool` | `false` | no | diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 48a225d5e8..5b416a85bb 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -72,8 +72,7 @@ resource "google_container_cluster" "gke_cluster" { remove_default_node_pool = true initial_node_count = 1 # must be set when remove_default_node_pool is set - # Sets default to false so terraform deletion is not prevented - deletion_protection = false + deletion_protection = var.deletion_protection network = var.network_id subnetwork = var.subnetwork_self_link diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index 9231b2e193..58bf197763 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -397,3 +397,13 @@ variable "networking_mode" { type = string default = "VPC_NATIVE" } + +variable "deletion_protection" { + description = <<-EOT + "Determines if the cluster can be deleted by gcluster commands or not". + To delete a cluster provisioned with deletion_protection set to true, you must first set it to false and apply the changes. + Then proceed with deletion as usual. + EOT + type = bool + default = false +} From ed820b840fa6979b0dd9c1ab20e8deb25715a068 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 12 Dec 2024 17:39:35 +0000 Subject: [PATCH 012/140] SlurmGCP. 
Replace `to_hostlist` with `to_hostlist_fast` --- .../modules/slurm_files/scripts/conf.py | 4 ++-- .../modules/slurm_files/scripts/resume.py | 22 +++++++++---------- .../modules/slurm_files/scripts/slurmsync.py | 18 +++++++-------- .../modules/slurm_files/scripts/suspend.py | 12 +++++----- .../slurm_files/scripts/tests/test_util.py | 4 ++-- .../modules/slurm_files/scripts/util.py | 18 +++------------ 6 files changed, 33 insertions(+), 45 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index c4bb37c579..4af58a7831 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -410,9 +410,9 @@ def __init__( def conf_line(self) -> str: d = {"SwitchName": self.name} if self.nodes: - d["Nodes"] = util.to_hostlist_fast(self.nodes) + d["Nodes"] = util.to_hostlist(self.nodes) if self.switches: - d["Switches"] = util.to_hostlist_fast(self.switches.keys()) + d["Switches"] = util.to_hostlist(self.switches.keys()) return dict_to_conf(d) def render_conf_lines(self) -> Iterable[str]: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 9a5e0b035b..669ccfc0a7 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -37,7 +37,7 @@ map_with_futures, run, separate, - to_hostlist_fast, + to_hostlist, trim_self_link, wait_for_operation, ) @@ -220,7 +220,7 @@ def create_instances_request(nodes: List[str], placement_group: Optional[str], e project=lookup().project, body=body.to_dict(), **method_args) - log.debug(f"new request: endpoint={req.methodId} nodes={to_hostlist_fast(nodes)}") + log.debug(f"new request: endpoint={req.methodId} nodes={to_hostlist(nodes)}") log_api_request(req) return req @@ -340,7 +340,7 @@ def resume_nodes(nodes: List[str], resume_data: Optional[ResumeData]): if log.isEnabledFor(logging.DEBUG): grouped_nodelists = { - group: to_hostlist_fast(chunk.nodes) for group, chunk in grouped_nodes.items() + group: to_hostlist(chunk.nodes) for group, chunk in grouped_nodes.items() } log.debug( "node bulk groups: \n{}".format(yaml.safe_dump(grouped_nodelists).rstrip()) @@ -401,7 +401,7 @@ def resume_nodes(nodes: List[str], resume_data: Optional[ResumeData]): bulk_op_name = bulk_op["name"] if "error" in bulk_op: error = bulk_op["error"]["errors"][0] - group_nodes = to_hostlist_fast(grouped_nodes[group].nodes) + group_nodes = to_hostlist(grouped_nodes[group].nodes) log.warning( f"bulkInsert operation errors: {error['code']} name={bulk_op_name} operationGroupId={group_id} nodes={group_nodes}" ) @@ -433,14 +433,14 @@ def resume_nodes(nodes: List[str], resume_data: Optional[ResumeData]): ready_nodes = {trim_self_link(op["targetLink"]) for op in successful_inserts} if len(ready_nodes) > 0: - ready_nodelist = to_hostlist_fast(ready_nodes) + ready_nodelist = to_hostlist(ready_nodes) log.info(f"created {len(ready_nodes)} instances: nodes={ready_nodelist}") all_successful_inserts.extend(successful_inserts) def down_nodes_notify_jobs(nodes: List[str], reason: str, 
resume_data: Optional[ResumeData]) -> None: """set nodes down with reason""" - nodelist = util.to_hostlist_fast(nodes) + nodelist = util.to_hostlist(nodes) reason_quoted = shlex.quote(reason) log.error(f"Marking nodes {nodelist} as DOWN, reason: {reason}") @@ -536,7 +536,7 @@ def _allocate_nodes_to_placements(nodes: List[str], excl_job_id:Optional[int], l if invalid: placements.append(PlacementAndNodes(placement=None, nodes=invalid)) - log.error(f"Could not find placement for nodes with unexpected names: {to_hostlist_fast(invalid)}") + log.error(f"Could not find placement for nodes with unexpected names: {to_hostlist(invalid)}") return placements @@ -545,7 +545,7 @@ def create_nodeset_placements(nodes: List[str], excl_job_id:Optional[int], lkp: region = lkp.node_region(nodes[0]) if log.isEnabledFor(logging.DEBUG): - debug_p = {p.placement: to_hostlist_fast(p.nodes) for p in placements} + debug_p = {p.placement: to_hostlist(p.nodes) for p in placements} log.debug( f"creating {len(placements)} placement groups: \n{yaml.safe_dump(debug_p).rstrip()}" ) @@ -591,7 +591,7 @@ def classify_result(item): ) log.info( - f"created {len(operations)} placement groups ({to_hostlist_fast(operations.keys())})" + f"created {len(operations)} placement groups ({to_hostlist(operations.keys())})" ) return placements @@ -617,7 +617,7 @@ def main(nodelist: str) -> None: ) if other_nodes: log.error( - f"Ignoring non-power-managed nodes '{to_hostlist_fast(other_nodes)}' from '{nodelist}'" + f"Ignoring non-power-managed nodes '{to_hostlist(other_nodes)}' from '{nodelist}'" ) if not nodes: @@ -625,7 +625,7 @@ def main(nodelist: str) -> None: return resume_data = get_resume_file_data() - log.info(f"resume {util.to_hostlist_fast(nodes)}") + log.info(f"resume {util.to_hostlist(nodes)}") resume_nodes(nodes, resume_data) if __name__ == "__main__": diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index d21211e8e7..4cb8fea7a5 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -38,7 +38,7 @@ install_custom_scripts, run, separate, - to_hostlist_fast, + to_hostlist, NSDict, NodeState, TPU, @@ -65,28 +65,28 @@ def __hash__(self): @dataclass(frozen=True) class NodeActionPowerUp(): def apply(self, nodes:List[str]) -> None: - hostlist = util.to_hostlist_fast(nodes) + hostlist = util.to_hostlist(nodes) log.info(f"{len(nodes)} instances to resume ({hostlist})") run(f"{lookup().scontrol} update nodename={hostlist} state=power_up") @dataclass(frozen=True) class NodeActionIdle(): def apply(self, nodes:List[str]) -> None: - hostlist = util.to_hostlist_fast(nodes) + hostlist = util.to_hostlist(nodes) log.info(f"{len(nodes)} nodes to idle ({hostlist})") run(f"{lookup().scontrol} update nodename={hostlist} state=resume") @dataclass(frozen=True) class NodeActionPowerDown(): def apply(self, nodes:List[str]) -> None: - hostlist = util.to_hostlist_fast(nodes) + hostlist = util.to_hostlist(nodes) log.info(f"{len(nodes)} instances to power down ({hostlist})") run(f"{lookup().scontrol} update nodename={hostlist} state=power_down") @dataclass(frozen=True) class NodeActionDelete(): def apply(self, nodes:List[str]) -> None: - hostlist = util.to_hostlist_fast(nodes) + hostlist = util.to_hostlist(nodes) 
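+        # to_hostlist collapses node names into a Slurm hostlist expression,
+        # e.g. ["seas7-0", "seas7-1"] -> "seas7-[0-1]" (example taken from
+        # tests/test_util.py in this change).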
log.info(f"{len(nodes)} instances to delete ({hostlist})") delete_instances(nodes) @@ -94,7 +94,7 @@ def apply(self, nodes:List[str]) -> None: class NodeActionPrempt(): def apply(self, nodes:List[str]) -> None: NodeActionDown(reason="Preempted instance").apply(nodes) - hostlist = util.to_hostlist_fast(nodes) + hostlist = util.to_hostlist(nodes) log.info(f"{len(nodes)} instances restarted ({hostlist})") start_instances(nodes) @@ -108,7 +108,7 @@ class NodeActionDown(): reason: str def apply(self, nodes: List[str]) -> None: - hostlist = util.to_hostlist_fast(nodes) + hostlist = util.to_hostlist(nodes) log.info(f"{len(nodes)} nodes set down ({hostlist}) with reason={self.reason}") run(f"{lookup().scontrol} update nodename={hostlist} state=down reason={shlex.quote(self.reason)}") @@ -118,7 +118,7 @@ class NodeActionUnknown(): instance_state: Optional[str] def apply(self, nodes:List[str]) -> None: - hostlist = util.to_hostlist_fast(nodes) + hostlist = util.to_hostlist(nodes) log.error(f"{len(nodes)} nodes have unexpected {self.slurm_state} and instance state:{self.instance_state}, ({hostlist})") def start_instance_op(inst): @@ -327,7 +327,7 @@ def ignore_err(e) -> bool: if failures: log.error(f"some placement groups failed to delete: {failures}") log.info( - f"deleted {len(done)} of {len(placement_groups)} placement groups ({to_hostlist_fast(done.keys())})" + f"deleted {len(done)} of {len(placement_groups)} placement groups ({to_hostlist(done.keys())})" ) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py index 4866dffb1e..dc901b6aba 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py @@ -24,7 +24,7 @@ groupby_unsorted, log_api_request, batch_execute, - to_hostlist_fast, + to_hostlist, wait_for_operations, separate, execute_with_futures, @@ -96,14 +96,14 @@ def delete_instances(instances): requests = {inst: delete_instance_request(inst) for inst in valid} - log.info(f"delete {len(valid)} instances ({to_hostlist_fast(valid)})") + log.info(f"delete {len(valid)} instances ({to_hostlist(valid)})") done, failed = batch_execute(requests) if failed: for err, nodes in groupby_unsorted(lambda n: failed[n][1], failed.keys()): - log.error(f"instances failed to delete: {err} ({to_hostlist_fast(nodes)})") + log.error(f"instances failed to delete: {err} ({to_hostlist(nodes)})") wait_for_operations(done.values()) # TODO do we need to check each operation for success? 
That is a lot more API calls - log.info(f"deleted {len(done)} instances {to_hostlist_fast(done.keys())}") + log.info(f"deleted {len(done)} instances {to_hostlist(done.keys())}") def suspend_nodes(nodes: List[str]) -> None: @@ -128,10 +128,10 @@ def main(nodelist): ) if other_nodes: log.debug( - f"Ignoring non-power-managed nodes '{to_hostlist_fast(other_nodes)}' from '{nodelist}'" + f"Ignoring non-power-managed nodes '{to_hostlist(other_nodes)}' from '{nodelist}'" ) if pm_nodes: - log.debug(f"Suspending nodes '{to_hostlist_fast(pm_nodes)}' from '{nodelist}'") + log.debug(f"Suspending nodes '{to_hostlist(pm_nodes)}' from '{nodelist}'") else: log.debug("No cloud nodes to suspend") return diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 2807740464..9cf36b2776 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -108,8 +108,8 @@ def test_node_desc_fail(name): ("seas7-0,seas7-1", "seas7-[0-1]"), ], ) -def test_to_hostlist_fast(names, expected): - assert util.to_hostlist_fast(names.split(",")) == expected +def test_to_hostlist(names, expected): + assert util.to_hostlist(names.split(",")) == expected @pytest.mark.parametrize( diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 605283c5bb..096e6d974c 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -877,25 +877,13 @@ def atoi(text): return [atoi(w) for w in re.split(r"(\d+)", text)] -# TODO: replace with to_hostlist_fast -def to_hostlist(nodenames) -> str: - """make hostlist from list of node names""" - # use tmp file because list could be large - tmp_file = tempfile.NamedTemporaryFile(mode="w+t", delete=False) - tmp_file.writelines("\n".join(sorted(nodenames, key=natural_sort))) - tmp_file.close() - hostlist = run(f"{lookup().scontrol} show hostlist {tmp_file.name}").stdout.rstrip() - os.remove(tmp_file.name) - return hostlist - - -def to_hostlist_fast(names: Iterable[str]) -> str: +def to_hostlist(names: Iterable[str]) -> str: """ - Fast implementation of to_hostlist that doesn't invoke `scontrol` + Fast implementation of `hostlist` that doesn't invoke `scontrol` IMPORTANT: * Acts as `scontrol show hostlistsorted`, i.e. original order is not preserved - * Achieves worse compression than `to_hostlist` for some cases + * Achieves worse compression than `scontrol show hostlist` for some cases """ pref = defaultdict(list) tokenizer = re.compile(r"^(.*?)(\d*)$") From 97f449e52ec978d38a33ac65a2ffbcd94e14bcf8 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 11 Dec 2024 01:42:55 +0000 Subject: [PATCH 013/140] Slurmsync. Safeguard against nodes missing from Slurm state. 
--- .../modules/slurm_files/scripts/slurmsync.py | 23 +++-------- .../slurm_files/scripts/tests/common.py | 1 + .../slurm_files/scripts/tests/test_util.py | 40 +++++++++++++++++++ .../modules/slurm_files/scripts/util.py | 36 ++++++++++++++++- 4 files changed, 81 insertions(+), 19 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index d21211e8e7..1e682cdeee 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -225,7 +225,7 @@ def _find_tpu_node_action(nodename, state) -> NodeAction: def get_node_action(nodename: str) -> NodeAction: """Determine node/instance status that requires action""" - state = lookup().slurm_node(nodename) + state = lookup().node_state(nodename) if lookup().node_is_fr(nodename): fr = lookup().future_reservation(lookup().node_nodeset(nodename)) @@ -381,24 +381,13 @@ def sync_placement_groups(): def sync_slurm(): - compute_instances = [ + compute_instances = { name for name, inst in lookup().instances().items() if inst.role == "compute" - ] - slurm_nodes = list(lookup().slurm_nodes().keys()) - - all_nodes = list( - set( - chain( - compute_instances, - slurm_nodes, - ) - ) - ) - log.debug( - f"reconciling {len(compute_instances)} ({len(all_nodes)-len(compute_instances)}) GCP instances and {len(slurm_nodes)} Slurm nodes ({len(all_nodes)-len(slurm_nodes)})." - ) + } + slurm_nodes = set(lookup().slurm_nodes().keys()) + log.debug(f"reconciling {len(compute_instances)} GCP instances and {len(slurm_nodes)} Slurm nodes.") - for action, nodes in util.groupby_unsorted(all_nodes, get_node_action): + for action, nodes in util.groupby_unsorted(compute_instances | slurm_nodes, get_node_action): action.apply(list(nodes)) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py index a807c00f28..54d7f45d43 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py @@ -53,6 +53,7 @@ class TstCfg: partitions: dict[str, Placeholder] = field(default_factory=dict) nodeset: dict[str, TstNodeset] = field(default_factory=dict) nodeset_tpu: dict[str, TstNodeset] = field(default_factory=dict) + nodeset_dyn: dict[str, TstNodeset] = field(default_factory=dict) install_dir: Optional[str] = None output_dir: Optional[str] = None diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 2807740464..23fa585f84 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Optional, Type + import pytest from mock import Mock from common import TstNodeset, TstCfg # needed to import util import util +from util import NodeState from datetime import timedelta from google.api_core.client_options import ClientOptions # noqa: E402 @@ -308,3 +311,40 @@ def test_nodeset_reservation_ok(nodeset, policies, expected): def test_parse_job_info(job_info, expected_job): lkp = util.Lookup(TstCfg()) assert lkp._parse_job_info(job_info) == expected_job + + + +@pytest.mark.parametrize( + "node,state,want", + [ + ("c-n-2", NodeState("DOWN", {}), NodeState("DOWN", {})), # happy scenario + ("c-d-vodoo", None, None), # dynamic nodeset + ("c-x-44", None, None), # unknown(removed) nodeset + ("c-n-7", None, None), # Out of bounds: c-n-[0-4] - downsized nodeset + ("c-t-7", None, None), # Out of bounds: c-t-[0-4] - downsized nodeset TPU + ("c-n-2", None, RuntimeError), # something is wrong + ("c-t-2", None, RuntimeError), # something is wrong, but TPU + + # Check boundaries match [0-5) + ("c-n-5", None, None), # out of boundaries + ("c-n-4", None, RuntimeError), # within boundaries + ]) +def test_node_state(node: str, state: Optional[NodeState], want: NodeState | None | Type[Exception]): + cfg = TstCfg( + slurm_cluster_name="c", + nodeset={ + "n": TstNodeset(node_count_static=2, node_count_dynamic_max=3)}, + nodeset_tpu={ + "t": TstNodeset(node_count_static=2, node_count_dynamic_max=3)}, + nodeset_dyn={ + "d": TstNodeset()}, + ) + lkp = util.Lookup(cfg) + lkp.slurm_nodes = lambda: {node: state} if state else {} + + if type(want) is type and issubclass(want, Exception): + with pytest.raises(want): + lkp.node_state(node) + else: + assert lkp.node_state(node) == want + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 605283c5bb..761e506f60 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -1595,6 +1595,7 @@ def node_nodeset(self, node_name=None): nodeset_name = self.node_nodeset_name(node_name) if nodeset_name in self.cfg.nodeset_tpu: return self.cfg.nodeset_tpu[nodeset_name] + return self.cfg.nodeset[nodeset_name] def partition_is_tpu(self, part: str) -> bool: @@ -1700,8 +1701,39 @@ def make_node_tuple(node_line): } return nodes - def slurm_node(self, nodename): - return self.slurm_nodes().get(nodename) + def node_state(self, nodename: str) -> Optional[NodeState]: + state = self.slurm_nodes().get(nodename) + if state is not None: + return state + + # state is None => Slurm doesn't know this node, + # there are two reasons: + # * happy: + # * node belongs to removed nodeset + # * node belongs to downsized portion of nodeset + # * dynamic node that didn't register itself + # * unhappy: + # * there is a drift in Slurm and SlurmGCP configurations + # * `slurm_nodes` function failed to handle `scontrol show nodes`, + # TODO: make `slurm_nodes` robust by using `scontrol show nodes --json` + # In either of "unhappy" cases it's too dangerous to proceed - abort slurmsync. 
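+        # Illustration, mirroring the cases in tests/test_util.py from this
+        # change: for cluster "c" and nodeset "n" with node_count_static=2 and
+        # node_count_dynamic_max=3, valid indices are 0..4; a missing "c-n-7"
+        # is treated as a downsized nodeset (returns None), while a missing
+        # "c-n-4" is within bounds and raises RuntimeError (likely config drift).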
+ try: + ns = self.node_nodeset(nodename) + except: + log.info(f"Unknown node {nodename}, belongs to unknown nodeset") + return None # Can't find nodeset, may be belongs to removed nodeset + + if self.node_is_dyn(nodename): + log.info(f"Unknown node {nodename}, belongs to dynamic nodeset") + return None # we can't make any judjment for dynamic nodes + + cnt = sum(self.static_dynamic_sizes(ns)) + if self.node_index(nodename) >= cnt: + log.info(f"Unknown node {nodename}, out of nodeset size boundaries ({cnt})") + return None # node belongs to downsized nodeset + + raise RuntimeError(f"Slurm does not recognize node {nodename}, potential misconfiguration.") + @lru_cache(maxsize=1) def instances(self) -> Dict[str, object]: From ce5a41df14a8edfed6eab03763e4dbc7c4030410 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 12 Dec 2024 18:01:39 +0000 Subject: [PATCH 014/140] SlurmGCP. Fix warning around `file_cache` --- .../modules/slurm_files/scripts/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 605283c5bb..2e1474c050 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -373,6 +373,7 @@ def build_request(http, *args, **kwargs): requestBuilder=build_request, credentials=credentials, discoveryServiceUrl=disc_url, + cache_discovery=False, # See https://github.com/googleapis/google-api-python-client/issues/299 ) def storage_client() -> storage.Client: From a93a9abc967a109a097a4d101ea0736640825b39 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 12 Dec 2024 18:44:21 +0000 Subject: [PATCH 015/140] SlurmGCP. Remove redundant terraform from newly migrated modules --- .../internal/slurm-gcp-v6/instance/README.md | 11 +---- .../internal/slurm-gcp-v6/instance/main.tf | 35 +--------------- .../slurm-gcp-v6/instance/variables.tf | 42 ------------------- .../slurm-gcp-v6/instance/versions.tf | 4 -- .../slurm-gcp-v6/instance_template/README.md | 3 +- .../slurm-gcp-v6/instance_template/main.tf | 18 +++----- .../instance_template/variables.tf | 21 +--------- .../internal_instance_template/README.md | 4 +- .../internal_instance_template/main.tf | 3 +- .../internal_instance_template/variables.tf | 21 ---------- .../schedmd-slurm-gcp-v6-controller/login.tf | 10 ++--- 11 files changed, 16 insertions(+), 156 deletions(-) diff --git a/community/modules/internal/slurm-gcp-v6/instance/README.md b/community/modules/internal/slurm-gcp-v6/instance/README.md index fadb65bac6..ae8462d763 100644 --- a/community/modules/internal/slurm-gcp-v6/instance/README.md +++ b/community/modules/internal/slurm-gcp-v6/instance/README.md @@ -48,7 +48,6 @@ limitations under the License. |------|---------| | [terraform](#requirement\_terraform) | ~> 1.0 | | [google](#requirement\_google) | >= 3.43 | -| [local](#requirement\_local) | ~> 2.0 | | [null](#requirement\_null) | ~> 3.0 | ## Providers @@ -56,7 +55,6 @@ limitations under the License. | Name | Version | |------|---------| | [google](#provider\_google) | >= 3.43 | -| [local](#provider\_local) | ~> 2.0 | | [null](#provider\_null) | ~> 3.0 | ## Modules @@ -71,27 +69,20 @@ No modules. 
| [null_resource.replace_trigger](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [google_compute_instance_template.base](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_instance_template) | data source | | [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | -| [local_file.startup](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [add\_hostname\_suffix](#input\_add\_hostname\_suffix) | Adds a suffix to the hostname | `bool` | `true` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
}))
| `[]` | no | -| [hostname](#input\_hostname) | Hostname of instances | `string` | `""` | no | -| [hostname\_suffix\_separator](#input\_hostname\_suffix\_separator) | Separator character to compose hostname when add\_hostname\_suffix is set to true. | `string` | `"-"` | no | +| [hostname](#input\_hostname) | Hostname of instances | `string` | n/a | yes | | [instance\_template](#input\_instance\_template) | Instance template self\_link used to create compute instances | `string` | n/a | yes | -| [labels](#input\_labels) | Labels, provided as a map. Merged and takes precedence over labels on instance template | `map(string)` | `{}` | no | -| [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | | [network](#input\_network) | Network to deploy to. Only one of network or subnetwork should be specified. | `string` | `""` | no | | [num\_instances](#input\_num\_instances) | Number of instances to create. This value is ignored if static\_ips is provided. | `number` | `1` | no | | [project\_id](#input\_project\_id) | The GCP project ID | `string` | `null` | no | | [region](#input\_region) | Region where the instances should be created. | `string` | `null` | no | | [replace\_trigger](#input\_replace\_trigger) | Trigger value to replace the instances. | `string` | `""` | no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming. | `string` | n/a | yes | -| [slurm\_instance\_role](#input\_slurm\_instance\_role) | Slurm instance type. Must be one of: controller; login; compute. | `string` | `null` | no | | [static\_ips](#input\_static\_ips) | List of static IPs for VM instances | `list(string)` | `[]` | no | | [subnetwork](#input\_subnetwork) | Subnet to deploy to. Only one of network or subnetwork should be specified. | `string` | `""` | no | | [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to | `string` | `null` | no | diff --git a/community/modules/internal/slurm-gcp-v6/instance/main.tf b/community/modules/internal/slurm-gcp-v6/instance/main.tf index 749ca9d3b3..5f3ce6e0df 100644 --- a/community/modules/internal/slurm-gcp-v6/instance/main.tf +++ b/community/modules/internal/slurm-gcp-v6/instance/main.tf @@ -20,20 +20,13 @@ ########## locals { - hostname = var.hostname == "" ? "default" : var.hostname num_instances = length(var.static_ips) == 0 ? var.num_instances : length(var.static_ips) # local.static_ips is the same as var.static_ips with a dummy element appended # at the end of the list to work around "list does not have any elements so cannot # determine type" error when var.static_ips is empty static_ips = concat(var.static_ips, ["NOT_AN_IP"]) -} - -################# -# LOCALS: SLURM # -################# -locals { network_interfaces = [for index in range(local.num_instances) : concat([ { @@ -52,9 +45,6 @@ locals { var.additional_networks ) ] - - slurm_instance_role = lower(var.slurm_instance_role) - } ################ @@ -71,10 +61,6 @@ data "google_compute_instance_template" "base" { name = var.instance_template } -data "local_file" "startup" { - filename = "${path.module}/../instance_template/files/startup_sh_unlinted" -} - ############# # INSTANCES # ############# @@ -86,7 +72,7 @@ resource "null_resource" "replace_trigger" { resource "google_compute_instance_from_template" "slurm_instance" { count = local.num_instances - name = var.add_hostname_suffix ? 
format("%s%s%s", local.hostname, var.hostname_suffix_separator, format("%03d", count.index + 1)) : local.hostname + name = format("%s-%s", var.hostname, format("%03d", count.index + 1)) project = var.project_id zone = var.zone == null ? data.google_compute_zones.available.names[count.index % length(data.google_compute_zones.available.names)] : var.zone @@ -128,25 +114,6 @@ resource "google_compute_instance_from_template" "slurm_instance" { source_instance_template = data.google_compute_instance_template.base.self_link - # Slurm - labels = merge( - data.google_compute_instance_template.base.labels, - var.labels, - { - slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role - }, - ) - metadata = merge( - data.google_compute_instance_template.base.metadata, - var.metadata, - { - slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role - startup-script = data.local_file.startup.content - }, - ) - lifecycle { replace_triggered_by = [null_resource.replace_trigger.id] } diff --git a/community/modules/internal/slurm-gcp-v6/instance/variables.tf b/community/modules/internal/slurm-gcp-v6/instance/variables.tf index 697d5c4b98..11111a2c05 100644 --- a/community/modules/internal/slurm-gcp-v6/instance/variables.tf +++ b/community/modules/internal/slurm-gcp-v6/instance/variables.tf @@ -42,13 +42,6 @@ variable "subnetwork_project" { variable "hostname" { description = "Hostname of instances" type = string - default = "" -} - -variable "add_hostname_suffix" { - description = "Adds a suffix to the hostname" - type = bool - default = true } variable "additional_networks" { @@ -115,45 +108,10 @@ variable "zone" { default = null } -variable "hostname_suffix_separator" { - description = "Separator character to compose hostname when add_hostname_suffix is set to true." - type = string - default = "-" -} - -variable "metadata" { - type = map(string) - description = "Metadata, provided as a map" - default = {} -} - -variable "labels" { - type = map(string) - description = "Labels, provided as a map. Merged and takes precedence over labels on instance template" - default = {} -} - ######### # SLURM # ######### -variable "slurm_instance_role" { - description = "Slurm instance type. Must be one of: controller; login; compute." - type = string - default = null - - validation { - condition = contains(["controller", "login", "compute"], lower(var.slurm_instance_role)) - error_message = "Must be one of: controller; login; compute." - } -} - -variable "slurm_cluster_name" { - description = "Cluster name, used for resource naming." - type = string -} - - variable "replace_trigger" { description = "Trigger value to replace the instances." 
type = string diff --git a/community/modules/internal/slurm-gcp-v6/instance/versions.tf b/community/modules/internal/slurm-gcp-v6/instance/versions.tf index 293a1ef8ca..a3e84c09bf 100644 --- a/community/modules/internal/slurm-gcp-v6/instance/versions.tf +++ b/community/modules/internal/slurm-gcp-v6/instance/versions.tf @@ -23,10 +23,6 @@ terraform { source = "hashicorp/google" version = ">= 3.43" } - local = { - source = "hashicorp/local" - version = "~> 2.0" - } null = { source = "hashicorp/null" version = "~> 3.0" diff --git a/community/modules/internal/slurm-gcp-v6/instance_template/README.md b/community/modules/internal/slurm-gcp-v6/instance_template/README.md index 8cef4311ca..0cd784b0c4 100644 --- a/community/modules/internal/slurm-gcp-v6/instance_template/README.md +++ b/community/modules/internal/slurm-gcp-v6/instance_template/README.md @@ -53,12 +53,11 @@ | [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [region](#input\_region) | Region where the instance template should be created. | `string` | `null` | no | -| [resource\_policies](#input\_resource\_policies) | A list of self\_links of resource policies to attach to the instance.
Currently a max of 1 resource policy is supported. | `list(string)` | `null` | no | | [service\_account](#input\_service\_account) | Service account to attach to the instances. See
'main.tf:local.service\_account' for the default. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [slurm\_bucket\_path](#input\_slurm\_bucket\_path) | GCS Bucket URI of Slurm cluster file storage. | `string` | n/a | yes | | [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming. | `string` | n/a | yes | -| [slurm\_instance\_role](#input\_slurm\_instance\_role) | Slurm instance type. Must be one of: controller; login; compute; or null. | `string` | `null` | no | +| [slurm\_instance\_role](#input\_slurm\_instance\_role) | Slurm instance type. Must be one of: controller; login; compute; or null. | `string` | n/a | yes | | [source\_image](#input\_source\_image) | Source disk image. | `string` | `""` | no | | [source\_image\_family](#input\_source\_image\_family) | Source image family. | `string` | `""` | no | | [source\_image\_project](#input\_source\_image\_project) | Project where the source image comes from. If it is not provided, the provider project is used. | `string` | `""` | no | diff --git a/community/modules/internal/slurm-gcp-v6/instance_template/main.tf b/community/modules/internal/slurm-gcp-v6/instance_template/main.tf index 64c4caa0a6..70846ed020 100644 --- a/community/modules/internal/slurm-gcp-v6/instance_template/main.tf +++ b/community/modules/internal/slurm-gcp-v6/instance_template/main.tf @@ -29,7 +29,7 @@ locals { disk.disk_labels, { slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role + slurm_instance_role = var.slurm_instance_role }, ) } @@ -57,13 +57,8 @@ locals { : "" ) - slurm_instance_role = var.slurm_instance_role != null ? lower(var.slurm_instance_role) : null - name_prefix = ( - local.slurm_instance_role != null - ? "${var.slurm_cluster_name}-${local.slurm_instance_role}-${var.name_prefix}" - : "${var.slurm_cluster_name}-${var.name_prefix}" - ) + name_prefix = "${var.slurm_cluster_name}-${var.slurm_instance_role}-${var.name_prefix}" total_egress_bandwidth_tier = var.bandwidth_tier == "tier_1_enabled" ? "TIER_1" : "DEFAULT" @@ -123,7 +118,7 @@ module "instance_template" { var.labels, { slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role + slurm_instance_role = var.slurm_instance_role }, ) instance_termination_action = var.termination_action @@ -136,7 +131,7 @@ module "instance_template" { enable-oslogin = upper(var.enable_oslogin) slurm_bucket_path = var.slurm_bucket_path slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role + slurm_instance_role = var.slurm_instance_role }, ) @@ -152,10 +147,9 @@ module "instance_template" { disk_labels = merge( { slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role + slurm_instance_role = var.slurm_instance_role }, var.disk_labels, ) - additional_disks = local.additional_disks - resource_policies = var.resource_policies + additional_disks = local.additional_disks } diff --git a/community/modules/internal/slurm-gcp-v6/instance_template/variables.tf b/community/modules/internal/slurm-gcp-v6/instance_template/variables.tf index e8393e9654..d9ff5591d4 100644 --- a/community/modules/internal/slurm-gcp-v6/instance_template/variables.tf +++ b/community/modules/internal/slurm-gcp-v6/instance_template/variables.tf @@ -340,14 +340,10 @@ variable "additional_disks" { variable "slurm_instance_role" { type = string description = "Slurm instance type. Must be one of: controller; login; compute; or null." - default = null validation { - condition = ( - var.slurm_instance_role == null - ? 
true - : contains(["controller", "login", "compute"], lower(var.slurm_instance_role))) - error_message = "Must be one of: controller; login; compute; or null." + condition = contains(["controller", "login", "compute"], var.slurm_instance_role) + error_message = "Must be one of: controller; login; compute." } } @@ -371,16 +367,3 @@ variable "slurm_bucket_path" { description = "GCS Bucket URI of Slurm cluster file storage." type = string } - -variable "resource_policies" { - description = <<-EOD - A list of self_links of resource policies to attach to the instance. - Currently a max of 1 resource policy is supported. - EOD - type = list(string) - default = null - validation { - condition = try(length(var.resource_policies) <= 1, true) - error_message = "Only one resource policy can be attached to the instance." - } -} diff --git a/community/modules/internal/slurm-gcp-v6/internal_instance_template/README.md b/community/modules/internal/slurm-gcp-v6/internal_instance_template/README.md index eff0d37a02..333886091b 100644 --- a/community/modules/internal/slurm-gcp-v6/internal_instance_template/README.md +++ b/community/modules/internal/slurm-gcp-v6/internal_instance_template/README.md @@ -41,7 +41,6 @@ No modules. | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `string` | `"100"` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, local-ssd, or pd-standard | `string` | `"pd-standard"` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Whether to enable the Confidential VM configuration on the instance. Note that the instance image must support Confidential VMs. See https://cloud.google.com/compute/docs/images | `bool` | `false` | no | -| [enable\_nested\_virtualization](#input\_enable\_nested\_virtualization) | Defines whether the instance should have nested virtualization enabled. | `bool` | `false` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Whether to enable the Shielded VM configuration on the instance. Note that the instance image must support Shielded VMs. See https://cloud.google.com/compute/docs/images | `bool` | `false` | no | | [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See https://cloud.google.com/compute/docs/gpus more details |
object({
type = string
count = number
})
| `null` | no | | [instance\_termination\_action](#input\_instance\_termination\_action) | Which action to take when Compute Engine preempts the VM. Value can be: 'STOP', 'DELETE'. The default value is 'STOP'.
See https://cloud.google.com/compute/docs/instances/spot for more details. | `string` | `"STOP"` | no | @@ -50,7 +49,7 @@ No modules. | [machine\_type](#input\_machine\_type) | Machine type to create, e.g. n1-standard-1 | `string` | `"n1-standard-1"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list: https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | -| [name\_prefix](#input\_name\_prefix) | Name prefix for the instance template | `string` | `"default-instance-template"` | no | +| [name\_prefix](#input\_name\_prefix) | Name prefix for the instance template | `string` | n/a | yes | | [network](#input\_network) | The name or self\_link of the network to attach this interface to. Use network attribute for Legacy or Auto subnetted networks and subnetwork for custom subnetted networks. | `string` | `""` | no | | [network\_ip](#input\_network\_ip) | Private IP address to assign to the instance if desired. | `string` | `""` | no | | [nic\_type](#input\_nic\_type) | The type of vNIC to be used on this interface. Possible values: GVNIC, VIRTIO\_NET. | `string` | `null` | no | @@ -58,7 +57,6 @@ No modules. | [preemptible](#input\_preemptible) | Allow the instance to be preempted | `bool` | `false` | no | | [project\_id](#input\_project\_id) | The GCP project ID | `string` | `null` | no | | [region](#input\_region) | Region where the instance template should be created. | `string` | `null` | no | -| [resource\_policies](#input\_resource\_policies) | A list of self\_links of resource policies to attach to the instance.
Currently a max of 1 resource policy is supported. | `list(string)` | `null` | no | | [service\_account](#input\_service\_account) | Service account to attach to the instance. See https://www.terraform.io/docs/providers/google/r/compute_instance_template#service_account. |
object({
email = optional(string)
scopes = set(string)
})
| n/a | yes | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Not used unless enable\_shielded\_vm is true. Shielded VM configuration for the instance. |
object({
enable_secure_boot = bool
enable_vtpm = bool
enable_integrity_monitoring = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [source\_image](#input\_source\_image) | Source disk image. If neither source\_image nor source\_image\_family is specified, defaults to the latest public CentOS image. | `string` | `""` | no | diff --git a/community/modules/internal/slurm-gcp-v6/internal_instance_template/main.tf b/community/modules/internal/slurm-gcp-v6/internal_instance_template/main.tf index be1fdd600e..eef402fafa 100644 --- a/community/modules/internal/slurm-gcp-v6/internal_instance_template/main.tf +++ b/community/modules/internal/slurm-gcp-v6/internal_instance_template/main.tf @@ -75,7 +75,6 @@ resource "google_compute_instance_template" "tpl" { metadata_startup_script = var.startup_script region = var.region min_cpu_platform = var.min_cpu_platform - resource_policies = var.resource_policies service_account { email = coalesce(var.service_account.email, "${data.google_project.this.number}-compute@developer.gserviceaccount.com") @@ -177,7 +176,7 @@ resource "google_compute_instance_template" "tpl" { } advanced_machine_features { - enable_nested_virtualization = var.enable_nested_virtualization + enable_nested_virtualization = false threads_per_core = var.threads_per_core } diff --git a/community/modules/internal/slurm-gcp-v6/internal_instance_template/variables.tf b/community/modules/internal/slurm-gcp-v6/internal_instance_template/variables.tf index 874fcf51bf..78a178038e 100644 --- a/community/modules/internal/slurm-gcp-v6/internal_instance_template/variables.tf +++ b/community/modules/internal/slurm-gcp-v6/internal_instance_template/variables.tf @@ -21,7 +21,6 @@ variable "project_id" { variable "name_prefix" { description = "Name prefix for the instance template" type = string - default = "default-instance-template" } variable "machine_type" { @@ -96,12 +95,6 @@ variable "region" { default = null } -variable "enable_nested_virtualization" { - type = bool - description = "Defines whether the instance should have nested virtualization enabled." - default = false -} - variable "threads_per_core" { description = "The number of threads per physical core. To disable simultaneous multithreading (SMT) set this to 1." type = number @@ -348,17 +341,3 @@ EOF }) default = null } - - -variable "resource_policies" { - description = <<-EOD - A list of self_links of resource policies to attach to the instance. - Currently a max of 1 resource policy is supported. - EOD - type = list(string) - default = null - validation { - condition = try(length(var.resource_policies) <= 1, true) - error_message = "Only one resource policy can be attached to the instance." 
- } -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 1f492a1402..cfb61787cb 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -59,16 +59,12 @@ module "slurm_login_instance" { source = "../../internal/slurm-gcp-v6/instance" for_each = { for x in var.login_nodes : x.name_prefix => x } - access_config = each.value.access_config - add_hostname_suffix = true - hostname = "${local.slurm_cluster_name}-${each.key}" - slurm_instance_role = "login" + access_config = each.value.access_config + hostname = "${local.slurm_cluster_name}-${each.key}" - project_id = var.project_id - slurm_cluster_name = local.slurm_cluster_name + project_id = var.project_id instance_template = module.slurm_login_template[each.key].self_link - labels = each.value.labels num_instances = each.value.num_instances additional_networks = each.value.additional_networks From 1b1615ebcadd012644b8d044f9cb486a6400e9a7 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Fri, 13 Dec 2024 08:43:16 +0000 Subject: [PATCH 016/140] Initial changes for fixing gke module provider --- examples/storage-gke.yaml | 29 +++++++++++++++++++ .../gke-persistent-volume/README.md | 5 ---- .../file-system/gke-persistent-volume/main.tf | 17 ----------- .../gke-persistent-volume/variables.tf | 5 ---- .../gke-persistent-volume/versions.tf | 4 --- modules/management/kubectl-apply/README.md | 7 ----- modules/management/kubectl-apply/main.tf | 25 +++------------- modules/management/kubectl-apply/providers.tf | 8 ----- modules/management/kubectl-apply/variables.tf | 11 ------- modules/management/kubectl-apply/versions.tf | 8 ----- modules/scheduler/gke-cluster/README.md | 3 ++ modules/scheduler/gke-cluster/main.tf | 3 -- modules/scheduler/gke-cluster/outputs.tf | 15 ++++++++++ .../pre-existing-gke-cluster/README.md | 4 +++ .../pre-existing-gke-cluster/main.tf | 5 ++-- .../pre-existing-gke-cluster/outputs.tf | 15 ++++++++++ pkg/config/expand.go | 18 ++++++++++++ 17 files changed, 90 insertions(+), 92 deletions(-) diff --git a/examples/storage-gke.yaml b/examples/storage-gke.yaml index faa587b046..d810b2bad0 100644 --- a/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -26,6 +26,35 @@ vars: deployment_groups: - group: primary + # terraform_providers: + # kubectl: + # source: "gavinbunney/kubectl" + # version: ">= 1.7.0" + # configuration: + # host: $(gke_cluster.gke_cluster_endpoint) + # cluster_ca_certificate: $(gke_cluster.cluster_ca_certificate) + # token: $(gke_cluster.access_token) + # load_config_file: false + # apply_retry_count: 15 + # # host: "module.gke_cluster.gke_cluster_endpoint" + # # cluster_ca_certificate: "module.gke_cluster.cluster_ca_certificate" + # # token: "module.gke_cluster.access_token" + # # load_config_file: false + # # apply_retry_count: 15 + # google: + # source: hashicorp/google + # version: 6.12.0 + # configuration: + # project: $(vars.project_id) + # region: $(vars.region) + # zone: $(vars.zone) + # google-beta: + # source: hashicorp/google-beta + # version: 6.13.0 + # configuration: + # project: $(vars.project_id) + # region: $(vars.region) + # zone: $(vars.zone) modules: - id: network1 source: modules/network/vpc diff --git a/modules/file-system/gke-persistent-volume/README.md b/modules/file-system/gke-persistent-volume/README.md index f4d94d8c3b..f53eb3067b 100644 --- 
a/modules/file-system/gke-persistent-volume/README.md +++ b/modules/file-system/gke-persistent-volume/README.md @@ -121,7 +121,6 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.0 | -| [google](#requirement\_google) | >= 4.42 | | [kubectl](#requirement\_kubectl) | >= 1.7.0 | | [local](#requirement\_local) | >= 2.0.0 | @@ -129,7 +128,6 @@ limitations under the License. | Name | Version | |------|---------| -| [google](#provider\_google) | >= 4.42 | | [kubectl](#provider\_kubectl) | >= 1.7.0 | | [local](#provider\_local) | >= 2.0.0 | @@ -144,15 +142,12 @@ No modules. | [kubectl_manifest.pv](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.pvc](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [local_file.debug_file](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | -| [google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes | -| [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes | | [filestore\_id](#input\_filestore\_id) | An identifier for a filestore with the format `projects/{{project}}/locations/{{location}}/instances/{{name}}`. | `string` | `null` | no | | [gcs\_bucket\_name](#input\_gcs\_bucket\_name) | The gcs bucket to be used with the persistent volume. | `string` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | diff --git a/modules/file-system/gke-persistent-volume/main.tf b/modules/file-system/gke-persistent-volume/main.tf index 5b52bcc950..4812a799e2 100644 --- a/modules/file-system/gke-persistent-volume/main.tf +++ b/modules/file-system/gke-persistent-volume/main.tf @@ -77,9 +77,6 @@ locals { capacity = "${var.capacity_gb}Gi" } ) - - cluster_name = split("/", var.cluster_id)[5] - cluster_location = split("/", var.cluster_id)[3] } resource "local_file" "debug_file" { @@ -90,20 +87,6 @@ resource "local_file" "debug_file" { filename = "${path.root}/pv-pvc-debug-file-${local.filestore_name}.yaml" } -data "google_container_cluster" "gke_cluster" { - name = local.cluster_name - location = local.cluster_location -} - -data "google_client_config" "default" {} - -provider "kubectl" { - host = "https://${data.google_container_cluster.gke_cluster.endpoint}" - cluster_ca_certificate = base64decode(data.google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate) - token = data.google_client_config.default.access_token - load_config_file = false -} - resource "kubectl_manifest" "pv" { yaml_body = local.is_gcs ? 
local.gcs_pv_contents : local.filestore_pv_contents diff --git a/modules/file-system/gke-persistent-volume/variables.tf b/modules/file-system/gke-persistent-volume/variables.tf index a72fa3857f..ebf411d593 100644 --- a/modules/file-system/gke-persistent-volume/variables.tf +++ b/modules/file-system/gke-persistent-volume/variables.tf @@ -14,11 +14,6 @@ * limitations under the License. */ -variable "cluster_id" { - description = "An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}`" - type = string -} - variable "network_storage" { description = "Network attached storage mount to be configured." type = object({ diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf index 9aa0deab4c..d0a426f723 100644 --- a/modules/file-system/gke-persistent-volume/versions.tf +++ b/modules/file-system/gke-persistent-volume/versions.tf @@ -15,10 +15,6 @@ terraform { required_version = ">= 1.0" required_providers { - google = { - source = "hashicorp/google" - version = ">= 4.42" - } kubectl = { source = "gavinbunney/kubectl" version = ">= 1.7.0" diff --git a/modules/management/kubectl-apply/README.md b/modules/management/kubectl-apply/README.md index 360929da22..02812a2d96 100644 --- a/modules/management/kubectl-apply/README.md +++ b/modules/management/kubectl-apply/README.md @@ -101,15 +101,12 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.3 | -| [google](#requirement\_google) | > 5.0 | | [http](#requirement\_http) | ~> 3.0 | -| [kubectl](#requirement\_kubectl) | >= 1.7.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | > 5.0 | | [terraform](#provider\_terraform) | n/a | ## Modules @@ -127,18 +124,14 @@ limitations under the License. |------|------| | [terraform_data.jobset_validations](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | | [terraform_data.kueue_validations](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | -| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | -| [google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md). |
list(object({
content = optional(string, null)
source = optional(string, null)
template_vars = optional(map(any), null)
server_side_apply = optional(bool, false)
wait_for_rollout = optional(bool, true)
}))
| `[]` | no | -| [cluster\_id](#input\_cluster\_id) | An identifier for the gke cluster resource with format projects//locations//clusters/. | `string` | n/a | yes | | [jobset](#input\_jobset) | Install [Jobset](https://github.com/kubernetes-sigs/jobset) which manages a group of K8s [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) as a unit. |
object({
install = optional(bool, false)
version = optional(string, "v0.5.2")
})
| `{}` | no | | [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. A configuration yaml/template file can be provided with config\_path to be applied right after kueue installation. If a template file provided, its variables can be set to config\_template\_vars. |
object({
install = optional(bool, false)
version = optional(string, "v0.8.1")
config_path = optional(string, null)
config_template_vars = optional(map(any), null)
})
| `{}` | no | -| [project\_id](#input\_project\_id) | The project ID that hosts the gke cluster. | `string` | n/a | yes | ## Outputs diff --git a/modules/management/kubectl-apply/main.tf b/modules/management/kubectl-apply/main.tf index 5663e01580..1f02677ff0 100644 --- a/modules/management/kubectl-apply/main.tf +++ b/modules/management/kubectl-apply/main.tf @@ -15,11 +15,6 @@ */ locals { - cluster_id_parts = split("/", var.cluster_id) - cluster_name = local.cluster_id_parts[5] - cluster_location = local.cluster_id_parts[3] - project_id = var.project_id != null ? var.project_id : local.cluster_id_parts[1] - apply_manifests_map = tomap({ for index, manifest in var.apply_manifests : index => manifest }) @@ -30,14 +25,6 @@ locals { jobset_install_source = format("${path.module}/manifests/jobset-%s.yaml", try(var.jobset.version, "")) } -data "google_container_cluster" "gke_cluster" { - project = local.project_id - name = local.cluster_name - location = local.cluster_location -} - -data "google_client_config" "default" {} - module "kubectl_apply_manifests" { for_each = local.apply_manifests_map source = "./kubectl" @@ -49,8 +36,7 @@ module "kubectl_apply_manifests" { wait_for_rollout = each.value.wait_for_rollout providers = { - kubectl = kubectl - http = http.h + http = http.h } } @@ -60,8 +46,7 @@ module "install_kueue" { server_side_apply = true providers = { - kubectl = kubectl - http = http.h + http = http.h } } @@ -71,8 +56,7 @@ module "install_jobset" { server_side_apply = true providers = { - kubectl = kubectl - http = http.h + http = http.h } } @@ -86,7 +70,6 @@ module "configure_kueue" { wait_for_rollout = true providers = { - kubectl = kubectl - http = http.h + http = http.h } } diff --git a/modules/management/kubectl-apply/providers.tf b/modules/management/kubectl-apply/providers.tf index 74d157b93b..d5577975f3 100644 --- a/modules/management/kubectl-apply/providers.tf +++ b/modules/management/kubectl-apply/providers.tf @@ -14,14 +14,6 @@ * limitations under the License. */ -provider "kubectl" { - host = "https://${data.google_container_cluster.gke_cluster.endpoint}" - token = data.google_client_config.default.access_token - cluster_ca_certificate = base64decode(data.google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate) - load_config_file = false - apply_retry_count = 15 # Terraform may apply resources in parallel, leading to potential dependency issues. This retry mechanism ensures that if a resource's dependencies aren't ready, Terraform will attempt to apply it again. -} - provider "http" { alias = "h" } diff --git a/modules/management/kubectl-apply/variables.tf b/modules/management/kubectl-apply/variables.tf index c493332e7c..7a4f54a0a9 100644 --- a/modules/management/kubectl-apply/variables.tf +++ b/modules/management/kubectl-apply/variables.tf @@ -37,17 +37,6 @@ resource "terraform_data" "jobset_validations" { } } -variable "project_id" { - description = "The project ID that hosts the gke cluster." - type = string -} - -variable "cluster_id" { - description = "An identifier for the gke cluster resource with format projects//locations//clusters/." - type = string - nullable = false -} - variable "apply_manifests" { description = "A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md)." 
type = list(object({ diff --git a/modules/management/kubectl-apply/versions.tf b/modules/management/kubectl-apply/versions.tf index 227838747c..774cddd7ff 100644 --- a/modules/management/kubectl-apply/versions.tf +++ b/modules/management/kubectl-apply/versions.tf @@ -16,14 +16,6 @@ terraform { required_providers { - google = { - source = "hashicorp/google" - version = "> 5.0" - } - kubectl = { - source = "gavinbunney/kubectl" - version = ">= 1.7.0" - } http = { source = "hashicorp/http" version = "~> 3.0" diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 675039add6..1ac653f698 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -197,7 +197,10 @@ limitations under the License. | Name | Description | |------|-------------| +| [access\_token](#output\_access\_token) | Google client config access token. | +| [cluster\_ca\_certificate](#output\_cluster\_ca\_certificate) | GKE cluster CA certificate. | | [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | +| [gke\_cluster\_endpoint](#output\_gke\_cluster\_endpoint) | GKE cluster endpoint. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. | | [gke\_version](#output\_gke\_version) | GKE cluster's version. | | [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. | diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 5b416a85bb..55188acb6b 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -336,9 +336,6 @@ module "workload_identity" { module "kubectl_apply" { source = "../../management/kubectl-apply" - cluster_id = google_container_cluster.gke_cluster.id - project_id = var.project_id - apply_manifests = flatten([ for idx, network_info in var.additional_networks : [ { diff --git a/modules/scheduler/gke-cluster/outputs.tf b/modules/scheduler/gke-cluster/outputs.tf index 28e00171ff..9ffd370664 100644 --- a/modules/scheduler/gke-cluster/outputs.tf +++ b/modules/scheduler/gke-cluster/outputs.tf @@ -79,3 +79,18 @@ output "gke_version" { description = "GKE cluster's version." value = google_container_cluster.gke_cluster.master_version } + +output "gke_cluster_endpoint" { + description = "GKE cluster endpoint." + value = "https://${google_container_cluster.gke_cluster.endpoint}" +} + +output "cluster_ca_certificate" { + description = "GKE cluster CA certificate." + value = base64decode(google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate) +} + +output "access_token" { + description = "Google client config access token." + value = data.google_client_config.default.access_token +} diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index aaac5a547f..4faed93294 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -94,6 +94,7 @@ limitations under the License. 
| Name | Type | |------|------| +| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | | [google_container_cluster.existing_gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source | ## Inputs @@ -110,7 +111,10 @@ limitations under the License. | Name | Description | |------|-------------| +| [access\_token](#output\_access\_token) | Google client config access token. | +| [cluster\_ca\_certificate](#output\_cluster\_ca\_certificate) | GKE cluster CA certificate. | | [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | +| [gke\_cluster\_endpoint](#output\_gke\_cluster\_endpoint) | GKE cluster endpoint. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. | | [gke\_version](#output\_gke\_version) | GKE cluster's version. | diff --git a/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf index 926d2be100..e90c8877ed 100644 --- a/modules/scheduler/pre-existing-gke-cluster/main.tf +++ b/modules/scheduler/pre-existing-gke-cluster/main.tf @@ -60,11 +60,10 @@ locals { ]) } +data "google_client_config" "default" {} + module "kubectl_apply" { source = "../../management/kubectl-apply" - cluster_id = data.google_container_cluster.existing_gke_cluster.id - project_id = var.project_id - apply_manifests = concat(local.apply_manifests_non_rdma_networks, local.apply_manifests_rdma_networks) } diff --git a/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/modules/scheduler/pre-existing-gke-cluster/outputs.tf index 8884ee30b0..cab4bf0b22 100644 --- a/modules/scheduler/pre-existing-gke-cluster/outputs.tf +++ b/modules/scheduler/pre-existing-gke-cluster/outputs.tf @@ -31,3 +31,18 @@ output "gke_version" { description = "GKE cluster's version." value = data.google_container_cluster.existing_gke_cluster.master_version } + +output "gke_cluster_endpoint" { + description = "GKE cluster endpoint." + value = "https://${data.google_container_cluster.existing_gke_cluster.endpoint}" +} + +output "cluster_ca_certificate" { + description = "GKE cluster CA certificate." + value = base64decode(data.google_container_cluster.existing_gke_cluster.master_auth[0].cluster_ca_certificate) +} + +output "access_token" { + description = "Google client config access token." 
+ value = data.google_client_config.default.access_token +} diff --git a/pkg/config/expand.go b/pkg/config/expand.go index ae5c30a328..ffabf3b6ce 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -207,6 +207,23 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { Configuration: gglConf}} } +// func getDefaultKubectlProviders() map[string]TerraformProvider { +// kubectlConf := Dict{} +// for s, v := range map[string]string{ +// "cluster_ca_certificate": "cluster_ca_certificate", +// "host": "gke_cluster_endpoint", +// "token": "access_token"} { +// kubectlConf = kubectlConf.With(s, ModuleRef("gke_cluster", v).AsValue()) +// } +// kubectlConf = kubectlConf.With("apply_retry_count", cty.NumberIntVal(15)) +// kubectlConf = kubectlConf.With("load_config_file", cty.BoolVal(false)) +// return map[string]TerraformProvider{ +// "kubectl": { +// Source: "gavinbunney/kubectl", +// Version: ">= 1.7.0", +// Configuration: kubectlConf}} +// } + func (bp Blueprint) expandProviders(grp *Group) { // 1. DEFAULT: use TerraformProviders provider dictionary (if supplied) // 2. If top-level TerraformProviders is defined, insert that @@ -216,6 +233,7 @@ func (bp Blueprint) expandProviders(grp *Group) { pv := &grp.TerraformProviders if defaults == nil { defaults = getDefaultGoogleProviders(bp) + // maps.Copy(defaults, getDefaultKubectlProviders()) } if (*pv) == nil { (*pv) = maps.Clone(defaults) From ef89293bc4431289e6fd584952d4e524374270d4 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Fri, 13 Dec 2024 11:30:05 +0000 Subject: [PATCH 017/140] Further refined changes for gke root module --- examples/storage-gke.yaml | 53 ++++++++++++--------------- modules/compute/gke-node-pool/main.tf | 3 -- pkg/config/config.go | 9 +++-- pkg/config/expand.go | 18 --------- pkg/config/path.go | 22 +++++------ 5 files changed, 40 insertions(+), 65 deletions(-) diff --git a/examples/storage-gke.yaml b/examples/storage-gke.yaml index d810b2bad0..e2452e83aa 100644 --- a/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -26,35 +26,30 @@ vars: deployment_groups: - group: primary - # terraform_providers: - # kubectl: - # source: "gavinbunney/kubectl" - # version: ">= 1.7.0" - # configuration: - # host: $(gke_cluster.gke_cluster_endpoint) - # cluster_ca_certificate: $(gke_cluster.cluster_ca_certificate) - # token: $(gke_cluster.access_token) - # load_config_file: false - # apply_retry_count: 15 - # # host: "module.gke_cluster.gke_cluster_endpoint" - # # cluster_ca_certificate: "module.gke_cluster.cluster_ca_certificate" - # # token: "module.gke_cluster.access_token" - # # load_config_file: false - # # apply_retry_count: 15 - # google: - # source: hashicorp/google - # version: 6.12.0 - # configuration: - # project: $(vars.project_id) - # region: $(vars.region) - # zone: $(vars.zone) - # google-beta: - # source: hashicorp/google-beta - # version: 6.13.0 - # configuration: - # project: $(vars.project_id) - # region: $(vars.region) - # zone: $(vars.zone) + terraform_providers: + kubectl: + source: "gavinbunney/kubectl" + version: ">= 1.7.0" + configuration: + host: $(gke_cluster.gke_cluster_endpoint) + cluster_ca_certificate: $(gke_cluster.cluster_ca_certificate) + token: $(gke_cluster.access_token) + load_config_file: false + apply_retry_count: 15 + google: + source: hashicorp/google + version: 6.12.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + google-beta: + source: hashicorp/google-beta + version: 6.13.0 + configuration: + project: 
$(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) modules: - id: network1 source: modules/network/vpc diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index f1999cbd0b..5d4bf02fb2 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -355,9 +355,6 @@ resource "null_resource" "enable_tcpxo_in_workload" { module "kubectl_apply" { source = "../../management/kubectl-apply" - cluster_id = var.cluster_id - project_id = var.project_id - apply_manifests = flatten([ for manifest in local.gpu_direct_setting.gpu_direct_manifests : [ { diff --git a/pkg/config/config.go b/pkg/config/config.go index df2192291f..099973f416 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -698,14 +698,15 @@ func (bp *Blueprint) checkToolkitModulesUrlAndVersion() error { func (bp *Blueprint) checkReferences() error { errs := Errors{} bp.visitDicts(func(dp dictPath, d *Dict) { - isModSettings := IsModuleSettingsPath(dp) + // isModSettings := IsModuleSettingsPath(dp) for k, v := range d.Items() { for ref, rp := range valueReferences(v) { path := dp.Dot(k).Cty(rp) + // fmt.Println("GlobalVar", ref.GlobalVar, "Name", ref.Name, "Module", ref.Module, "rp", rp, "path", path) if !ref.GlobalVar { - if !isModSettings { - errs.At(path, fmt.Errorf("module output %q can only be referenced in other module settings", ref)) - } + // if !isModSettings { + // errs.At(path, fmt.Errorf("module output %q can only be referenced in other module settings", ref)) + // } // module to module references are checked by validateModuleSettingReferences later return } diff --git a/pkg/config/expand.go b/pkg/config/expand.go index ffabf3b6ce..ae5c30a328 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -207,23 +207,6 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { Configuration: gglConf}} } -// func getDefaultKubectlProviders() map[string]TerraformProvider { -// kubectlConf := Dict{} -// for s, v := range map[string]string{ -// "cluster_ca_certificate": "cluster_ca_certificate", -// "host": "gke_cluster_endpoint", -// "token": "access_token"} { -// kubectlConf = kubectlConf.With(s, ModuleRef("gke_cluster", v).AsValue()) -// } -// kubectlConf = kubectlConf.With("apply_retry_count", cty.NumberIntVal(15)) -// kubectlConf = kubectlConf.With("load_config_file", cty.BoolVal(false)) -// return map[string]TerraformProvider{ -// "kubectl": { -// Source: "gavinbunney/kubectl", -// Version: ">= 1.7.0", -// Configuration: kubectlConf}} -// } - func (bp Blueprint) expandProviders(grp *Group) { // 1. DEFAULT: use TerraformProviders provider dictionary (if supplied) // 2. 
If top-level TerraformProviders is defined, insert that @@ -233,7 +216,6 @@ func (bp Blueprint) expandProviders(grp *Group) { pv := &grp.TerraformProviders if defaults == nil { defaults = getDefaultGoogleProviders(bp) - // maps.Copy(defaults, getDefaultKubectlProviders()) } if (*pv) == nil { (*pv) = maps.Clone(defaults) diff --git a/pkg/config/path.go b/pkg/config/path.go index 7d84f449d4..d0869d4bd2 100644 --- a/pkg/config/path.go +++ b/pkg/config/path.go @@ -194,14 +194,14 @@ func init() { initPath(&Root, nil, "") } -func IsModuleSettingsPath(p Path) bool { - parent := p.Parent() - if parent == nil { - return false - } - mp, ok := parent.(*ModulePath) - if !ok { - return false - } - return p == mp.Settings -} +// func IsModuleSettingsPath(p Path) bool { +// parent := p.Parent() +// if parent == nil { +// return false +// } +// mp, ok := parent.(*ModulePath) +// if !ok { +// return false +// } +// return p == mp.Settings +// } From 049376da2e85553306ae76806f0b6d0263bfc69e Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 13 Dec 2024 19:04:20 +0000 Subject: [PATCH 018/140] Fix misusage of `groupby_unsorted` --- .../modules/slurm_files/scripts/slurmsync.py | 2 +- .../modules/slurm_files/scripts/suspend.py | 5 ++--- .../modules/slurm_files/scripts/util.py | 8 +++----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index b26cdfcd5a..8b67365e68 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -387,7 +387,7 @@ def sync_slurm(): slurm_nodes = set(lookup().slurm_nodes().keys()) log.debug(f"reconciling {len(compute_instances)} GCP instances and {len(slurm_nodes)} Slurm nodes.") - for action, nodes in util.groupby_unsorted(compute_instances | slurm_nodes, get_node_action): + for action, nodes in util.groupby_unsorted(list(compute_instances | slurm_nodes), get_node_action): action.apply(list(nodes)) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py index dc901b6aba..f01013e1a2 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py @@ -98,9 +98,8 @@ def delete_instances(instances): log.info(f"delete {len(valid)} instances ({to_hostlist(valid)})") done, failed = batch_execute(requests) - if failed: - for err, nodes in groupby_unsorted(lambda n: failed[n][1], failed.keys()): - log.error(f"instances failed to delete: {err} ({to_hostlist(nodes)})") + for node, (_, err) in failed.items(): + log.error(f"instance {node} failed to delete: {err}") wait_for_operations(done.values()) # TODO do we need to check each operation for success? 
That is a lot more API calls log.info(f"deleted {len(done)} instances {to_hostlist(done.keys())}") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 47d8b6c771..017443002f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -14,10 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Iterable, List, Tuple, Optional, Any, Dict +from typing import Iterable, List, Tuple, Optional, Any, Dict, Sequence import argparse import base64 -import collections from dataclasses import dataclass from datetime import timedelta, datetime import hashlib @@ -36,7 +35,7 @@ import sys import tempfile from enum import Enum -from collections import defaultdict, namedtuple +from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import contextmanager from functools import lru_cache, reduce, wraps @@ -746,8 +745,7 @@ def chunked(iterable, n=API_REQ_LIMIT): return yield chunk - -def groupby_unsorted(seq, key): +def groupby_unsorted(seq: Sequence[Any], key): indices = defaultdict(list) for i, el in enumerate(seq): indices[key(el)].append(i) From 03413f8e82368501885f116f36a484fe4f91d569 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Sat, 14 Dec 2024 08:41:09 +0000 Subject: [PATCH 019/140] Append kubectl provider in root module for blueprint with gke cluster module --- examples/storage-gke.yaml | 24 -------------- .../gke-persistent-volume/README.md | 1 + .../file-system/gke-persistent-volume/main.tf | 2 +- .../gke-persistent-volume/variables.tf | 5 +++ modules/scheduler/gke-cluster/README.md | 2 +- modules/scheduler/gke-cluster/outputs.tf | 2 +- .../pre-existing-gke-cluster/README.md | 2 +- .../pre-existing-gke-cluster/outputs.tf | 2 +- pkg/config/config.go | 9 +++-- pkg/config/expand.go | 33 +++++++++++++++++++ pkg/config/path.go | 22 ++++++------- 11 files changed, 59 insertions(+), 45 deletions(-) diff --git a/examples/storage-gke.yaml b/examples/storage-gke.yaml index e2452e83aa..faa587b046 100644 --- a/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -26,30 +26,6 @@ vars: deployment_groups: - group: primary - terraform_providers: - kubectl: - source: "gavinbunney/kubectl" - version: ">= 1.7.0" - configuration: - host: $(gke_cluster.gke_cluster_endpoint) - cluster_ca_certificate: $(gke_cluster.cluster_ca_certificate) - token: $(gke_cluster.access_token) - load_config_file: false - apply_retry_count: 15 - google: - source: hashicorp/google - version: 6.12.0 - configuration: - project: $(vars.project_id) - region: $(vars.region) - zone: $(vars.zone) - google-beta: - source: hashicorp/google-beta - version: 6.13.0 - configuration: - project: $(vars.project_id) - region: $(vars.region) - zone: $(vars.zone) modules: - id: network1 source: modules/network/vpc diff --git a/modules/file-system/gke-persistent-volume/README.md b/modules/file-system/gke-persistent-volume/README.md index f53eb3067b..23bce2de8d 100644 --- a/modules/file-system/gke-persistent-volume/README.md +++ b/modules/file-system/gke-persistent-volume/README.md @@ -150,6 +150,7 @@ No modules. 
| [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes | | [filestore\_id](#input\_filestore\_id) | An identifier for a filestore with the format `projects/{{project}}/locations/{{location}}/instances/{{name}}`. | `string` | `null` | no | | [gcs\_bucket\_name](#input\_gcs\_bucket\_name) | The gcs bucket to be used with the persistent volume. | `string` | `null` | no | +| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | n/a | yes | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [network\_storage](#input\_network\_storage) | Network attached storage mount to be configured. |
object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
})
| n/a | yes | diff --git a/modules/file-system/gke-persistent-volume/main.tf b/modules/file-system/gke-persistent-volume/main.tf index 4812a799e2..c9ebaa6010 100644 --- a/modules/file-system/gke-persistent-volume/main.tf +++ b/modules/file-system/gke-persistent-volume/main.tf @@ -92,7 +92,7 @@ resource "kubectl_manifest" "pv" { lifecycle { precondition { - condition = (var.gcs_bucket_name != null) != (var.filestore_id != null) + condition = var.gke_cluster_exists && (var.gcs_bucket_name != null) != (var.filestore_id != null) error_message = "Either gcs_bucket_name or filestore_id must be set." } } diff --git a/modules/file-system/gke-persistent-volume/variables.tf b/modules/file-system/gke-persistent-volume/variables.tf index ebf411d593..88ff9c36b8 100644 --- a/modules/file-system/gke-persistent-volume/variables.tf +++ b/modules/file-system/gke-persistent-volume/variables.tf @@ -14,6 +14,11 @@ * limitations under the License. */ +variable "gke_cluster_exists" { + description = "A static flag that signals to modules that a cluster has been created." + type = bool +} + variable "network_storage" { description = "Network attached storage mount to be configured." type = object({ diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 1ac653f698..e5a869156d 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -200,9 +200,9 @@ limitations under the License. | [access\_token](#output\_access\_token) | Google client config access token. | | [cluster\_ca\_certificate](#output\_cluster\_ca\_certificate) | GKE cluster CA certificate. | | [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | -| [gke\_cluster\_endpoint](#output\_gke\_cluster\_endpoint) | GKE cluster endpoint. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. | | [gke\_version](#output\_gke\_version) | GKE cluster's version. | +| [host\_endpoint](#output\_host\_endpoint) | GKE cluster endpoint. | | [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. | | [k8s\_service\_account\_name](#output\_k8s\_service\_account\_name) | Name of k8s service account. | diff --git a/modules/scheduler/gke-cluster/outputs.tf b/modules/scheduler/gke-cluster/outputs.tf index 9ffd370664..087462049a 100644 --- a/modules/scheduler/gke-cluster/outputs.tf +++ b/modules/scheduler/gke-cluster/outputs.tf @@ -80,7 +80,7 @@ output "gke_version" { value = google_container_cluster.gke_cluster.master_version } -output "gke_cluster_endpoint" { +output "host_endpoint" { description = "GKE cluster endpoint." value = "https://${google_container_cluster.gke_cluster.endpoint}" } diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index 4faed93294..c0715c472d 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -114,7 +114,7 @@ limitations under the License. | [access\_token](#output\_access\_token) | Google client config access token. | | [cluster\_ca\_certificate](#output\_cluster\_ca\_certificate) | GKE cluster CA certificate. 
| | [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | -| [gke\_cluster\_endpoint](#output\_gke\_cluster\_endpoint) | GKE cluster endpoint. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. | | [gke\_version](#output\_gke\_version) | GKE cluster's version. | +| [host\_endpoint](#output\_host\_endpoint) | GKE cluster endpoint. | diff --git a/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/modules/scheduler/pre-existing-gke-cluster/outputs.tf index cab4bf0b22..880928d21b 100644 --- a/modules/scheduler/pre-existing-gke-cluster/outputs.tf +++ b/modules/scheduler/pre-existing-gke-cluster/outputs.tf @@ -32,7 +32,7 @@ output "gke_version" { value = data.google_container_cluster.existing_gke_cluster.master_version } -output "gke_cluster_endpoint" { +output "host_endpoint" { description = "GKE cluster endpoint." value = "https://${data.google_container_cluster.existing_gke_cluster.endpoint}" } diff --git a/pkg/config/config.go b/pkg/config/config.go index 099973f416..df2192291f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -698,15 +698,14 @@ func (bp *Blueprint) checkToolkitModulesUrlAndVersion() error { func (bp *Blueprint) checkReferences() error { errs := Errors{} bp.visitDicts(func(dp dictPath, d *Dict) { - // isModSettings := IsModuleSettingsPath(dp) + isModSettings := IsModuleSettingsPath(dp) for k, v := range d.Items() { for ref, rp := range valueReferences(v) { path := dp.Dot(k).Cty(rp) - // fmt.Println("GlobalVar", ref.GlobalVar, "Name", ref.Name, "Module", ref.Module, "rp", rp, "path", path) if !ref.GlobalVar { - // if !isModSettings { - // errs.At(path, fmt.Errorf("module output %q can only be referenced in other module settings", ref)) - // } + if !isModSettings { + errs.At(path, fmt.Errorf("module output %q can only be referenced in other module settings", ref)) + } // module to module references are checked by validateModuleSettingReferences later return } diff --git a/pkg/config/expand.go b/pkg/config/expand.go index ae5c30a328..6e6d722c51 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -17,6 +17,7 @@ package config import ( "errors" "fmt" + "strings" "hpc-toolkit/pkg/modulereader" "hpc-toolkit/pkg/sourcereader" @@ -186,6 +187,34 @@ func (bp Blueprint) expandBackend(grp *Group) { } } +func kubectlProviderRequiredModules(grp *Group) []Module { + mods := []Module{} + for _, mod := range grp.Modules { + if strings.Contains(mod.Source, "gke-cluster") || strings.Contains(mod.Source, "pre-existing-gke-cluster") { + mods = append(mods, mod) + } + } + return mods +} + +func getModuleKubectlProviders(mod Module) map[string]TerraformProvider { + kubectlConf := Dict{} + for s, v := range map[string]string{ + "cluster_ca_certificate": "cluster_ca_certificate", + "host": "host_endpoint", + "token": "access_token"} { + kubectlConf = kubectlConf.With(s, ModuleRef(mod.ID, v).AsValue()) + } + // kubectlConf = kubectlConf.With("alias", cty.StringVal(string(mod.ID))) + kubectlConf = kubectlConf.With("apply_retry_count", cty.NumberIntVal(15)) + kubectlConf = kubectlConf.With("load_config_file", cty.BoolVal(false)) + return map[string]TerraformProvider{ + "kubectl": { + Source: "gavinbunney/kubectl", + Version: ">= 1.7.0", + Configuration: kubectlConf}} +} + func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { gglConf := Dict{} for s, v := range 
map[string]string{ @@ -220,6 +249,10 @@ func (bp Blueprint) expandProviders(grp *Group) { if (*pv) == nil { (*pv) = maps.Clone(defaults) } + mods := kubectlProviderRequiredModules(grp) + for _, mod := range mods { + maps.Copy((*pv), getModuleKubectlProviders(mod)) + } } func getModuleInputMap(inputs []modulereader.VarInfo) map[string]cty.Type { diff --git a/pkg/config/path.go b/pkg/config/path.go index d0869d4bd2..7d84f449d4 100644 --- a/pkg/config/path.go +++ b/pkg/config/path.go @@ -194,14 +194,14 @@ func init() { initPath(&Root, nil, "") } -// func IsModuleSettingsPath(p Path) bool { -// parent := p.Parent() -// if parent == nil { -// return false -// } -// mp, ok := parent.(*ModulePath) -// if !ok { -// return false -// } -// return p == mp.Settings -// } +func IsModuleSettingsPath(p Path) bool { + parent := p.Parent() + if parent == nil { + return false + } + mp, ok := parent.(*ModulePath) + if !ok { + return false + } + return p == mp.Settings +} From da63370ae6d5b1dc80fbe4e552c24d5463c48fd4 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Sat, 14 Dec 2024 08:53:42 +0000 Subject: [PATCH 020/140] Append kubectl provider in root module for blueprint with gke cluster module --- modules/file-system/gke-persistent-volume/main.tf | 2 +- modules/management/kubectl-apply/README.md | 1 + modules/management/kubectl-apply/main.tf | 3 +++ modules/management/kubectl-apply/variables.tf | 5 +++++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/modules/file-system/gke-persistent-volume/main.tf b/modules/file-system/gke-persistent-volume/main.tf index c9ebaa6010..df8b4733d8 100644 --- a/modules/file-system/gke-persistent-volume/main.tf +++ b/modules/file-system/gke-persistent-volume/main.tf @@ -93,7 +93,7 @@ resource "kubectl_manifest" "pv" { lifecycle { precondition { condition = var.gke_cluster_exists && (var.gcs_bucket_name != null) != (var.filestore_id != null) - error_message = "Either gcs_bucket_name or filestore_id must be set." + error_message = "GKE cluster should exists and either gcs_bucket_name or filestore_id must be set." } } } diff --git a/modules/management/kubectl-apply/README.md b/modules/management/kubectl-apply/README.md index 02812a2d96..64f254d11b 100644 --- a/modules/management/kubectl-apply/README.md +++ b/modules/management/kubectl-apply/README.md @@ -130,6 +130,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md). |
list(object({
content = optional(string, null)
source = optional(string, null)
template_vars = optional(map(any), null)
server_side_apply = optional(bool, false)
wait_for_rollout = optional(bool, true)
}))
| `[]` | no | +| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | n/a | yes | | [jobset](#input\_jobset) | Install [Jobset](https://github.com/kubernetes-sigs/jobset) which manages a group of K8s [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) as a unit. |
object({
install = optional(bool, false)
version = optional(string, "v0.5.2")
})
| `{}` | no | | [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. A configuration yaml/template file can be provided with config\_path to be applied right after kueue installation. If a template file provided, its variables can be set to config\_template\_vars. |
object({
install = optional(bool, false)
version = optional(string, "v0.8.1")
config_path = optional(string, null)
config_template_vars = optional(map(any), null)
})
| `{}` | no | diff --git a/modules/management/kubectl-apply/main.tf b/modules/management/kubectl-apply/main.tf index 1f02677ff0..85f5f4ba03 100644 --- a/modules/management/kubectl-apply/main.tf +++ b/modules/management/kubectl-apply/main.tf @@ -34,6 +34,7 @@ module "kubectl_apply_manifests" { template_vars = each.value.template_vars server_side_apply = each.value.server_side_apply wait_for_rollout = each.value.wait_for_rollout + depends_on = [var.gke_cluster_exists] providers = { http = http.h @@ -44,6 +45,7 @@ module "install_kueue" { source = "./kubectl" source_path = local.install_kueue ? local.kueue_install_source : null server_side_apply = true + depends_on = [var.gke_cluster_exists] providers = { http = http.h @@ -54,6 +56,7 @@ module "install_jobset" { source = "./kubectl" source_path = local.install_jobset ? local.jobset_install_source : null server_side_apply = true + depends_on = [var.gke_cluster_exists] providers = { http = http.h diff --git a/modules/management/kubectl-apply/variables.tf b/modules/management/kubectl-apply/variables.tf index 7a4f54a0a9..cb1cc6b690 100644 --- a/modules/management/kubectl-apply/variables.tf +++ b/modules/management/kubectl-apply/variables.tf @@ -37,6 +37,11 @@ resource "terraform_data" "jobset_validations" { } } +variable "gke_cluster_exists" { + description = "A static flag that signals to modules that a cluster has been created." + type = bool +} + variable "apply_manifests" { description = "A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md)." type = list(object({ From 0e1d96e2987041017d60c0c895813cce9773c58b Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Sat, 14 Dec 2024 12:58:44 +0000 Subject: [PATCH 021/140] Further refined changes --- .../modules/compute/gke-topology-scheduler/README.md | 3 +-- .../modules/compute/gke-topology-scheduler/main.tf | 4 +--- .../compute/gke-topology-scheduler/variables.tf | 12 ++++-------- modules/file-system/gke-storage/README.md | 3 +-- modules/file-system/gke-storage/main.tf | 4 +--- modules/file-system/gke-storage/variables.tf | 12 ++++-------- modules/management/kubectl-apply/README.md | 2 +- modules/management/kubectl-apply/main.tf | 8 ++++---- modules/management/kubectl-apply/variables.tf | 1 + modules/scheduler/gke-cluster/main.tf | 2 ++ modules/scheduler/pre-existing-gke-cluster/main.tf | 2 ++ pkg/config/expand.go | 10 ++++------ 12 files changed, 26 insertions(+), 37 deletions(-) diff --git a/community/modules/compute/gke-topology-scheduler/README.md b/community/modules/compute/gke-topology-scheduler/README.md index 5aaa4fca98..8d5b42913e 100644 --- a/community/modules/compute/gke-topology-scheduler/README.md +++ b/community/modules/compute/gke-topology-scheduler/README.md @@ -45,8 +45,7 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | -| [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | +| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. 
| `bool` | `false` | no | ## Outputs diff --git a/community/modules/compute/gke-topology-scheduler/main.tf b/community/modules/compute/gke-topology-scheduler/main.tf index 677595632b..1c2b658668 100644 --- a/community/modules/compute/gke-topology-scheduler/main.tf +++ b/community/modules/compute/gke-topology-scheduler/main.tf @@ -13,11 +13,9 @@ # limitations under the License. module "kubectl_apply" { + count = var.gke_cluster_exists ? 1 : 0 source = "../../../../modules/management/kubectl-apply" - cluster_id = var.cluster_id - project_id = var.project_id - apply_manifests = [ { source = "${path.module}/manifests/topology-scheduler-scripts.yaml" }, { source = "${path.module}/manifests/service-account.yaml" }, diff --git a/community/modules/compute/gke-topology-scheduler/variables.tf b/community/modules/compute/gke-topology-scheduler/variables.tf index 0766091223..2fcbb93d58 100644 --- a/community/modules/compute/gke-topology-scheduler/variables.tf +++ b/community/modules/compute/gke-topology-scheduler/variables.tf @@ -12,12 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -variable "project_id" { - description = "The project ID to host the cluster in." - type = string -} - -variable "cluster_id" { - description = "projects/{{project}}/locations/{{location}}/clusters/{{cluster}}" - type = string +variable "gke_cluster_exists" { + description = "A static flag that signals to modules that a cluster has been created." + type = bool + default = false } diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index 17c718aa37..f1b3c2884e 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -109,11 +109,10 @@ No resources. |------|-------------|------|---------|:--------:| | [access\_mode](#input\_access\_mode) | The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes)
Valid access modes:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
- ReadWriteOncePod | `string` | n/a | yes | | [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes | -| [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes | +| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [mount\_options](#input\_mount\_options) | Controls the mountOptions for dynamically provisioned PersistentVolumes of this storage class. | `string` | `null` | no | | [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection.
If using a new VPC, please use community/modules/network/private-service-access to create private-service-access, and
If using an existing VPC with private-service-access enabled, set this manually following the [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no | -| [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [pv\_mount\_path](#input\_pv\_mount\_path) | Path within the container at which the volume should be mounted. Must not contain ':'. | `string` | `"/data"` | no | | [pvc\_count](#input\_pvc\_count) | How many PersistentVolumeClaims will be created | `number` | `1` | no | | [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported values:
- Retain
- Delete | `string` | n/a | yes | diff --git a/modules/file-system/gke-storage/main.tf b/modules/file-system/gke-storage/main.tf index 18f85fa779..bb738162e5 100644 --- a/modules/file-system/gke-storage/main.tf +++ b/modules/file-system/gke-storage/main.tf @@ -37,11 +37,9 @@ check "private_vpc_connection_peering" { } module "kubectl_apply" { + count = var.gke_cluster_exists ? 1 : 0 source = "../../management/kubectl-apply" - cluster_id = var.cluster_id - project_id = var.project_id - # count = var.pvc_count apply_manifests = flatten( [ diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf index 9ad3b839d8..9efbe6082c 100644 --- a/modules/file-system/gke-storage/variables.tf +++ b/modules/file-system/gke-storage/variables.tf @@ -14,14 +14,10 @@ * limitations under the License. */ -variable "project_id" { - description = "The project ID to host the cluster in." - type = string -} - -variable "cluster_id" { - description = "An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}`" - type = string +variable "gke_cluster_exists" { + description = "A static flag that signals to modules that a cluster has been created." + type = bool + default = false } variable "labels" { diff --git a/modules/management/kubectl-apply/README.md b/modules/management/kubectl-apply/README.md index 64f254d11b..47f0076618 100644 --- a/modules/management/kubectl-apply/README.md +++ b/modules/management/kubectl-apply/README.md @@ -130,7 +130,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md). |
list(object({
content = optional(string, null)
source = optional(string, null)
template_vars = optional(map(any), null)
server_side_apply = optional(bool, false)
wait_for_rollout = optional(bool, true)
}))
| `[]` | no | -| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | n/a | yes | +| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no | | [jobset](#input\_jobset) | Install [Jobset](https://github.com/kubernetes-sigs/jobset) which manages a group of K8s [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) as a unit. |
object({
install = optional(bool, false)
version = optional(string, "v0.5.2")
})
| `{}` | no | | [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. A configuration yaml/template file can be provided with config\_path to be applied right after kueue installation. If a template file provided, its variables can be set to config\_template\_vars. |
object({
install = optional(bool, false)
version = optional(string, "v0.8.1")
config_path = optional(string, null)
config_template_vars = optional(map(any), null)
})
| `{}` | no | diff --git a/modules/management/kubectl-apply/main.tf b/modules/management/kubectl-apply/main.tf index 85f5f4ba03..cc1abd05f0 100644 --- a/modules/management/kubectl-apply/main.tf +++ b/modules/management/kubectl-apply/main.tf @@ -26,7 +26,7 @@ locals { } module "kubectl_apply_manifests" { - for_each = local.apply_manifests_map + for_each = var.gke_cluster_exists ? local.apply_manifests_map : {} source = "./kubectl" content = each.value.content @@ -34,7 +34,6 @@ module "kubectl_apply_manifests" { template_vars = each.value.template_vars server_side_apply = each.value.server_side_apply wait_for_rollout = each.value.wait_for_rollout - depends_on = [var.gke_cluster_exists] providers = { http = http.h @@ -42,10 +41,10 @@ module "kubectl_apply_manifests" { } module "install_kueue" { + count = var.gke_cluster_exists ? 1 : 0 source = "./kubectl" source_path = local.install_kueue ? local.kueue_install_source : null server_side_apply = true - depends_on = [var.gke_cluster_exists] providers = { http = http.h @@ -53,10 +52,10 @@ module "install_kueue" { } module "install_jobset" { + count = var.gke_cluster_exists ? 1 : 0 source = "./kubectl" source_path = local.install_jobset ? local.jobset_install_source : null server_side_apply = true - depends_on = [var.gke_cluster_exists] providers = { http = http.h @@ -64,6 +63,7 @@ module "install_jobset" { } module "configure_kueue" { + count = var.gke_cluster_exists ? 1 : 0 source = "./kubectl" source_path = local.install_kueue ? try(var.kueue.config_path, "") : null template_vars = local.install_kueue ? try(var.kueue.config_template_vars, null) : null diff --git a/modules/management/kubectl-apply/variables.tf b/modules/management/kubectl-apply/variables.tf index cb1cc6b690..356d268d9f 100644 --- a/modules/management/kubectl-apply/variables.tf +++ b/modules/management/kubectl-apply/variables.tf @@ -40,6 +40,7 @@ resource "terraform_data" "jobset_validations" { variable "gke_cluster_exists" { description = "A static flag that signals to modules that a cluster has been created." 
type = bool + default = false } variable "apply_manifests" { diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 55188acb6b..2d1dca3267 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -353,4 +353,6 @@ module "kubectl_apply" { } ] ]) + + depends_on = [google_container_cluster.gke_cluster] } diff --git a/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf index e90c8877ed..95761ef10d 100644 --- a/modules/scheduler/pre-existing-gke-cluster/main.tf +++ b/modules/scheduler/pre-existing-gke-cluster/main.tf @@ -66,4 +66,6 @@ module "kubectl_apply" { source = "../../management/kubectl-apply" apply_manifests = concat(local.apply_manifests_non_rdma_networks, local.apply_manifests_rdma_networks) + + depends_on = [data.google_container_cluster.existing_gke_cluster] } diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 6e6d722c51..a2f67b2b5a 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -187,14 +187,13 @@ func (bp Blueprint) expandBackend(grp *Group) { } } -func kubectlProviderRequiredModules(grp *Group) []Module { - mods := []Module{} +func kubectlProviderRequiredModule(grp *Group) (bool, Module) { for _, mod := range grp.Modules { if strings.Contains(mod.Source, "gke-cluster") || strings.Contains(mod.Source, "pre-existing-gke-cluster") { - mods = append(mods, mod) + return true, mod } } - return mods + return false, Module{} } func getModuleKubectlProviders(mod Module) map[string]TerraformProvider { @@ -249,8 +248,7 @@ func (bp Blueprint) expandProviders(grp *Group) { if (*pv) == nil { (*pv) = maps.Clone(defaults) } - mods := kubectlProviderRequiredModules(grp) - for _, mod := range mods { + if ok, mod := kubectlProviderRequiredModule(grp); ok { maps.Copy((*pv), getModuleKubectlProviders(mod)) } } From d07250519d55af03efc2575053408cf50d8497eb Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Sat, 14 Dec 2024 13:06:20 +0000 Subject: [PATCH 022/140] Added changes related to kubectl-apply module --- modules/compute/gke-node-pool/README.md | 1 + modules/compute/gke-node-pool/main.tf | 1 + modules/compute/gke-node-pool/variables.tf | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index d2715ff652..17d5a95fee 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -322,6 +322,7 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | +| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(object({
gpu_driver_version = string
}), { gpu_driver_version = "DEFAULT" })
gpu_partition_size = optional(string)
gpu_sharing_config = optional(object({
gpu_sharing_strategy = string
max_shared_clients_per_gpu = number
}))
}))
| `[]` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 5d4bf02fb2..a0a334356e 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -353,6 +353,7 @@ resource "null_resource" "enable_tcpxo_in_workload" { # apply manifest to enable tcpx module "kubectl_apply" { + count = var.gke_cluster_exists ? 1 : 0 source = "../../management/kubectl-apply" apply_manifests = flatten([ diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index d3b403b564..b15fc3f3ef 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -24,6 +24,12 @@ variable "cluster_id" { type = string } +variable "gke_cluster_exists" { + description = "A static flag that signals to modules that a cluster has been created." + type = bool + default = false +} + variable "zones" { description = "A list of zones to be used. Zones must be in region of cluster. If null, cluster zones will be inherited. Note `zones` not `zone`; does not work with `zone` deployment variable." type = list(string) From f45e151fb96573765c804d2c6fafdabae0120996 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Sat, 14 Dec 2024 13:32:58 +0000 Subject: [PATCH 023/140] Added changes related to kubectl-apply module --- community/modules/compute/gke-topology-scheduler/main.tf | 3 ++- modules/compute/gke-node-pool/main.tf | 3 ++- modules/file-system/gke-persistent-volume/README.md | 2 +- modules/file-system/gke-persistent-volume/main.tf | 5 +++-- modules/file-system/gke-persistent-volume/variables.tf | 1 + modules/file-system/gke-storage/main.tf | 3 ++- modules/scheduler/gke-cluster/main.tf | 2 ++ modules/scheduler/pre-existing-gke-cluster/main.tf | 2 ++ 8 files changed, 15 insertions(+), 6 deletions(-) diff --git a/community/modules/compute/gke-topology-scheduler/main.tf b/community/modules/compute/gke-topology-scheduler/main.tf index 1c2b658668..3a79befcf3 100644 --- a/community/modules/compute/gke-topology-scheduler/main.tf +++ b/community/modules/compute/gke-topology-scheduler/main.tf @@ -13,9 +13,10 @@ # limitations under the License. module "kubectl_apply" { - count = var.gke_cluster_exists ? 1 : 0 source = "../../../../modules/management/kubectl-apply" + gke_cluster_exists = var.gke_cluster_exists + apply_manifests = [ { source = "${path.module}/manifests/topology-scheduler-scripts.yaml" }, { source = "${path.module}/manifests/service-account.yaml" }, diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index a0a334356e..64013374f1 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -353,9 +353,10 @@ resource "null_resource" "enable_tcpxo_in_workload" { # apply manifest to enable tcpx module "kubectl_apply" { - count = var.gke_cluster_exists ? 1 : 0 source = "../../management/kubectl-apply" + gke_cluster_exists = var.gke_cluster_exists + apply_manifests = flatten([ for manifest in local.gpu_direct_setting.gpu_direct_manifests : [ { diff --git a/modules/file-system/gke-persistent-volume/README.md b/modules/file-system/gke-persistent-volume/README.md index 23bce2de8d..b5967763c9 100644 --- a/modules/file-system/gke-persistent-volume/README.md +++ b/modules/file-system/gke-persistent-volume/README.md @@ -150,7 +150,7 @@ No modules. 
| [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes | | [filestore\_id](#input\_filestore\_id) | An identifier for a filestore with the format `projects/{{project}}/locations/{{location}}/instances/{{name}}`. | `string` | `null` | no | | [gcs\_bucket\_name](#input\_gcs\_bucket\_name) | The gcs bucket to be used with the persistent volume. | `string` | `null` | no | -| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | n/a | yes | +| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [network\_storage](#input\_network\_storage) | Network attached storage mount to be configured. |
object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
})
| n/a | yes | diff --git a/modules/file-system/gke-persistent-volume/main.tf b/modules/file-system/gke-persistent-volume/main.tf index df8b4733d8..d12c5d6d39 100644 --- a/modules/file-system/gke-persistent-volume/main.tf +++ b/modules/file-system/gke-persistent-volume/main.tf @@ -88,12 +88,13 @@ resource "local_file" "debug_file" { } resource "kubectl_manifest" "pv" { + count = var.gke_cluster_exists ? 1 : 0 yaml_body = local.is_gcs ? local.gcs_pv_contents : local.filestore_pv_contents lifecycle { precondition { - condition = var.gke_cluster_exists && (var.gcs_bucket_name != null) != (var.filestore_id != null) - error_message = "GKE cluster should exists and either gcs_bucket_name or filestore_id must be set." + condition = (var.gcs_bucket_name != null) != (var.filestore_id != null) + error_message = "Either gcs_bucket_name or filestore_id must be set." } } } diff --git a/modules/file-system/gke-persistent-volume/variables.tf b/modules/file-system/gke-persistent-volume/variables.tf index 88ff9c36b8..96e3f31949 100644 --- a/modules/file-system/gke-persistent-volume/variables.tf +++ b/modules/file-system/gke-persistent-volume/variables.tf @@ -17,6 +17,7 @@ variable "gke_cluster_exists" { description = "A static flag that signals to modules that a cluster has been created." type = bool + default = false } variable "network_storage" { diff --git a/modules/file-system/gke-storage/main.tf b/modules/file-system/gke-storage/main.tf index bb738162e5..f26936de0c 100644 --- a/modules/file-system/gke-storage/main.tf +++ b/modules/file-system/gke-storage/main.tf @@ -37,9 +37,10 @@ check "private_vpc_connection_peering" { } module "kubectl_apply" { - count = var.gke_cluster_exists ? 1 : 0 source = "../../management/kubectl-apply" + gke_cluster_exists = var.gke_cluster_exists + # count = var.pvc_count apply_manifests = flatten( [ diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 2d1dca3267..823bfb5cbe 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -336,6 +336,8 @@ module "workload_identity" { module "kubectl_apply" { source = "../../management/kubectl-apply" + gke_cluster_exists = true + apply_manifests = flatten([ for idx, network_info in var.additional_networks : [ { diff --git a/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf index 95761ef10d..800ed87a51 100644 --- a/modules/scheduler/pre-existing-gke-cluster/main.tf +++ b/modules/scheduler/pre-existing-gke-cluster/main.tf @@ -65,6 +65,8 @@ data "google_client_config" "default" {} module "kubectl_apply" { source = "../../management/kubectl-apply" + gke_cluster_exists = true + apply_manifests = concat(local.apply_manifests_non_rdma_networks, local.apply_manifests_rdma_networks) depends_on = [data.google_container_cluster.existing_gke_cluster] From 3b2ab776a08e2055310f8ec27e89bfed955cd264 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 16 Dec 2024 07:37:46 +0000 Subject: [PATCH 024/140] SlurmGCP. 
Reduce usage of NSDict --- .../modules/slurm_files/scripts/resume.py | 47 ++++++++----------- .../modules/slurm_files/scripts/slurmsync.py | 3 +- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 669ccfc0a7..5d88751a41 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -155,11 +155,6 @@ def dws_flex_duration(dws_flex:object, job_id: Optional[int]) -> int: log.info("Job TimeLimit cannot be less than 30 seconds or exceed 2 weeks") return max_duration -def per_instance_properties(node): - props = NSDict() - # No properties beyond name are supported yet. - - return props def create_instances_request(nodes: List[str], placement_group: Optional[str], excl_job_id: Optional[int]): """Call regionInstances.bulkInsert to create instances""" @@ -167,31 +162,27 @@ def create_instances_request(nodes: List[str], placement_group: Optional[str], e # model here indicates any node that can be used to describe the rest model = next(iter(nodes)) - nodeset = lookup().node_nodeset(model) - template = lookup().node_template(model) log.debug(f"create_instances_request: {model} placement: {placement_group}") - body = NSDict() + nodeset = lookup().node_nodeset(model) + template = lookup().node_template(model) + labels = {"slurm_job_id": excl_job_id} if excl_job_id else None - body.count = len(nodes) + body = dict( + count = len(nodes), + sourceInstanceTemplate = template, + # key is instance name, value overwrites properties (no overwrites) + perInstanceProperties = {k: {} for k in nodes}, + instanceProperties = instance_properties( + nodeset, model, placement_group, labels, excl_job_id + ), + ) if placement_group: assert len(nodes) <= PLACEMENT_MAX_CNT pass # do not set minCount to force "all or nothing" behavior else: - body.minCount = 1 - - # source of instance properties - body.sourceInstanceTemplate = template - - labels = {"slurm_job_id": excl_job_id} if excl_job_id else None - # overwrites properties across all instances - body.instanceProperties = instance_properties( - nodeset, model, placement_group, labels, excl_job_id - ) - - # key is instance name, value overwrites properties - body.perInstanceProperties = {k: per_instance_properties(k) for k in nodes} + body["minCount"] = 1 zone_allow = nodeset.zone_policy_allow or [] zone_deny = nodeset.zone_policy_deny or [] @@ -203,10 +194,12 @@ def create_instances_request(nodes: List[str], placement_group: Optional[str], e api_method = lookup().compute.regionInstances().bulkInsert method_args = {"region": lookup().node_region(model)} - body.locationPolicy.locations = { - **{ f"zones/{z}": {"preference": "ALLOW"} for z in zone_allow }, - **{ f"zones/{z}": {"preference": "DENY"} for z in zone_deny }} - body.locationPolicy.targetShape = nodeset.zone_target_shape + body["locationPolicy"] = dict( + locations = { + **{ f"zones/{z}": {"preference": "ALLOW"} for z in zone_allow }, + **{ f"zones/{z}": {"preference": "DENY"} for z in zone_deny }}, + targetShape = nodeset.zone_target_shape, + ) if lookup().cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.pre_instance_bulk_insert( @@ -218,7 +211,7 @@ def create_instances_request(nodes: List[str], placement_group: Optional[str], e req = api_method( 
project=lookup().project, - body=body.to_dict(), + body=body, **method_args) log.debug(f"new request: endpoint={req.methodId} nodes={to_hostlist(nodes)}") log_api_request(req) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 8b67365e68..21d9324e79 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -39,7 +39,6 @@ run, separate, to_hostlist, - NSDict, NodeState, TPU, chunked, @@ -363,7 +362,7 @@ def sync_placement_groups(): result = ensure_execute(op) # merge placement group info from API and job_id,partition,index parsed from the name pgs = ( - NSDict({**pg, **pg_regex.match(pg["name"]).groupdict()}) + {**pg, **pg_regex.match(pg["name"]).groupdict()} for pg in chain.from_iterable( item["resourcePolicies"] for item in result.get("items", {}).values() From 926f5ed34f045bbe08f3392242553e2628f8b5f5 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Mon, 16 Dec 2024 10:15:41 +0000 Subject: [PATCH 025/140] Update README for parallelstore related example blueprint --- examples/README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/examples/README.md b/examples/README.md index 30883ce0f9..4abeb289f9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1518,6 +1518,30 @@ cleaned up when the job is deleted. [storage-gke.yaml]: ../examples/storage-gke.yaml +### [gke-storage-parallelstore.yaml] ![core-badge] ![experimental-badge] + +This blueprint shows how to use parallelstore storage options with GKE in the toolkit. + +The blueprint contains the following: + +* A K8s Job that uses a parallelstore storage volume option. +* A K8s Job that demonstrates ML training workload with parallelstore storage disk ops. + +> **Warning**: In this example, when storage type `Parallelstore` is specified in `gke-storage` module. +> The lifecycle of the parallelstore is not managed by the blueprint. +> On glcuster destroy ops, the Parallelstore created will also be destroyed. +> +> [!Note] +> The Kubernetes API server will only allow requests from authorized networks. +> The `gke-cluster` module needs access to the Kubernetes API server +> to create a Persistent Volume and a Persistent Volume Claim. **You must use +> the `authorized_cidr` variable to supply an authorized network which contains +> the IP address of the machine deploying the blueprint, for example +> `--vars authorized_cidr=/32`.** You can use a service like +> [whatismyip.com](https://whatismyip.com) to determine your IP address. + +[gke-storage-parallelstore.yaml]: ../examples/gke-storage-parallelstore.yaml + ### [gke-a3-megagpu.yaml] ![core-badge] ![experimental-badge] This blueprint shows how to provision a GKE cluster with A3 Mega machines in the toolkit. 
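As a companion to the gke-storage-parallelstore section added above, here is a minimal sketch of how a blueprint might wire the `gke-storage` module for Parallelstore. It is an illustrative sketch only, not the shipped `gke-storage-parallelstore.yaml`: the `storage_type` setting name and the `gke_cluster` module ID referenced in `use:` are assumptions, while the remaining settings correspond to inputs documented in the module's README.

```yaml
  # Illustrative sketch: the storage_type name and the gke_cluster ID are assumptions.
  - id: parallelstore-storage
    source: modules/file-system/gke-storage
    use: [gke_cluster]              # assumed ID of the blueprint's gke-cluster module
    settings:
      storage_type: Parallelstore   # assumed input name for selecting the backend
      access_mode: ReadWriteMany    # one of the documented access modes
      sc_reclaim_policy: Delete     # documented values are Retain and Delete
      capacity_gb: 12000
      pvc_count: 2
      pv_mount_path: /data
      gke_cluster_exists: true      # normally propagated from the cluster module; shown explicitly here
```

Because `gke_cluster_exists` defaults to `false`, the module's kubectl-backed resources are only created once the flag is set (or wired) to `true`, which is the gating behavior introduced by this patch series.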
From 67be248c364f6078f5031a8688725fe740cc9a28 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Mon, 16 Dec 2024 11:11:11 +0000 Subject: [PATCH 026/140] Update README with GKE parallelstore related example blueprint details --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index 4abeb289f9..95cea23dae 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1528,8 +1528,8 @@ The blueprint contains the following: * A K8s Job that demonstrates ML training workload with parallelstore storage disk ops. > **Warning**: In this example, when storage type `Parallelstore` is specified in `gke-storage` module. -> The lifecycle of the parallelstore is not managed by the blueprint. -> On glcuster destroy ops, the Parallelstore created will also be destroyed. +> The lifecycle of the parallelstore is managed by the blueprint. +> On glcuster destroy ops, the Parallelstore storage created will also be destroyed. > > [!Note] > The Kubernetes API server will only allow requests from authorized networks. From c59164fe99839a1549a864a99e4f1cc1de32cfc4 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Mon, 16 Dec 2024 12:31:40 +0000 Subject: [PATCH 027/140] Added unit test cases --- pkg/config/expand.go | 15 ++++++--- pkg/config/expand_test.go | 70 +++++++++++++++++++++++++++++++++++---- 2 files changed, 74 insertions(+), 11 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index a2f67b2b5a..5ba931bcbb 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -197,16 +197,23 @@ func kubectlProviderRequiredModule(grp *Group) (bool, Module) { } func getModuleKubectlProviders(mod Module) map[string]TerraformProvider { + modOutputs := []string{} + for idx := range mod.Outputs { + modOutputs = append(modOutputs, mod.Outputs[idx].Name) + } + kubectlConf := Dict{} for s, v := range map[string]string{ "cluster_ca_certificate": "cluster_ca_certificate", "host": "host_endpoint", "token": "access_token"} { - kubectlConf = kubectlConf.With(s, ModuleRef(mod.ID, v).AsValue()) + if slices.Contains(modOutputs, v) { + kubectlConf = kubectlConf.With(s, ModuleRef(mod.ID, v).AsValue()) + } } - // kubectlConf = kubectlConf.With("alias", cty.StringVal(string(mod.ID))) - kubectlConf = kubectlConf.With("apply_retry_count", cty.NumberIntVal(15)) - kubectlConf = kubectlConf.With("load_config_file", cty.BoolVal(false)) + kubectlConf = kubectlConf. + With("apply_retry_count", cty.NumberIntVal(15)). + With("load_config_file", cty.BoolVal(false)) return map[string]TerraformProvider{ "kubectl": { Source: "gavinbunney/kubectl", diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index e1ad008407..7774dbc90e 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -87,16 +87,72 @@ func (s *zeroSuite) TestExpandProviders(c *C) { With("zone", cty.StringVal("zone1")). With("universe_domain", cty.StringVal("test-universe.com"))}} + defaultProvider := map[string]PR{ + "google": TerraformProvider{ + Source: "hashicorp/google", + Version: "~> 6.13.0"}, + "google-beta": TerraformProvider{ + Source: "hashicorp/google-beta", + Version: "~> 6.13.0"}} + + testGKEClusterModuleID := ModuleID("dummy_cluster") + testGKEClusterModuleOutputName := "host_endpoint" + + kubectlProvider := PR{ + Source: "gavinbunney/kubectl", + Version: ">= 1.7.0", + Configuration: Dict{}. + With("host", ModuleRef(testGKEClusterModuleID, testGKEClusterModuleOutputName).AsValue()). + With("apply_retry_count", cty.NumberIntVal(15)). 
+ With("load_config_file", cty.BoolVal(false))} + + testModuleOutputs := []modulereader.OutputInfo{ + {Name: testGKEClusterModuleOutputName}} + + testGKEClusterModule := Module{ + Source: "module/test/gke-cluster", + ID: testGKEClusterModuleID, + Outputs: testModuleOutputs} + + testPreExistingGKEClusterModule := Module{ + Source: "module/test/pre-existing-gke-cluster", + ID: testGKEClusterModuleID, + Outputs: testModuleOutputs} + { // no def PR, no group PR - match default values g := Group{Name: "clown"} noDefPr.expandProviders(&g) - c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ - "google": TerraformProvider{ - Source: "hashicorp/google", - Version: "~> 6.13.0"}, - "google-beta": TerraformProvider{ - Source: "hashicorp/google-beta", - Version: "~> 6.13.0"}}) + c.Check(g.TerraformProviders, DeepEquals, defaultProvider) + } + + { // no def PR, no group PR, group only have gke cluster module + g := Group{ + Name: "clown", + Modules: []Module{testGKEClusterModule}} + defaultProvider["kubectl"] = kubectlProvider + noDefPr.expandProviders(&g) + c.Check(g.TerraformProviders, DeepEquals, defaultProvider) + delete(defaultProvider, "kubectl") + } + + { // no def PR, no group PR, group only have pre existing gke cluster module + g := Group{ + Name: "clown", + Modules: []Module{testPreExistingGKEClusterModule}} + defaultProvider["kubectl"] = kubectlProvider + noDefPr.expandProviders(&g) + c.Check(g.TerraformProviders, DeepEquals, defaultProvider) + delete(defaultProvider, "kubectl") + } + + { // no def PR, no group PR, group have both gke cluster and pre existing gke cluster module + g := Group{ + Name: "clown", + Modules: []Module{testGKEClusterModule, testPreExistingGKEClusterModule}} + defaultProvider["kubectl"] = kubectlProvider + noDefPr.expandProviders(&g) + c.Check(g.TerraformProviders, DeepEquals, defaultProvider) + delete(defaultProvider, "kubectl") } { // no def PR, group PR From 93fa0f4a18b7a5935352b7b6c809fa7c8bf26d35 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Mon, 16 Dec 2024 12:37:00 +0000 Subject: [PATCH 028/140] Updated unit test case --- pkg/config/expand_test.go | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 7774dbc90e..625b8c2c15 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -106,18 +106,18 @@ func (s *zeroSuite) TestExpandProviders(c *C) { With("apply_retry_count", cty.NumberIntVal(15)). 
With("load_config_file", cty.BoolVal(false))} - testModuleOutputs := []modulereader.OutputInfo{ + testGKEClusterModuleOutputs := []modulereader.OutputInfo{ {Name: testGKEClusterModuleOutputName}} testGKEClusterModule := Module{ - Source: "module/test/gke-cluster", + Source: "module/test/gke-cluster/dummy", ID: testGKEClusterModuleID, - Outputs: testModuleOutputs} + Outputs: testGKEClusterModuleOutputs} testPreExistingGKEClusterModule := Module{ - Source: "module/test/pre-existing-gke-cluster", + Source: "module/test/pre-existing-gke-cluster/dummy", ID: testGKEClusterModuleID, - Outputs: testModuleOutputs} + Outputs: testGKEClusterModuleOutputs} { // no def PR, no group PR - match default values g := Group{Name: "clown"} @@ -145,16 +145,6 @@ func (s *zeroSuite) TestExpandProviders(c *C) { delete(defaultProvider, "kubectl") } - { // no def PR, no group PR, group have both gke cluster and pre existing gke cluster module - g := Group{ - Name: "clown", - Modules: []Module{testGKEClusterModule, testPreExistingGKEClusterModule}} - defaultProvider["kubectl"] = kubectlProvider - noDefPr.expandProviders(&g) - c.Check(g.TerraformProviders, DeepEquals, defaultProvider) - delete(defaultProvider, "kubectl") - } - { // no def PR, group PR g := Group{ Name: "clown", From 90aa74bb9470f7c74f6dd614437383730e3c8e31 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Mon, 16 Dec 2024 13:21:32 +0000 Subject: [PATCH 029/140] Updated unit test cases --- pkg/config/expand.go | 9 +------ pkg/config/expand_test.go | 54 ++++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 34 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 5ba931bcbb..1005eb780f 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -197,19 +197,12 @@ func kubectlProviderRequiredModule(grp *Group) (bool, Module) { } func getModuleKubectlProviders(mod Module) map[string]TerraformProvider { - modOutputs := []string{} - for idx := range mod.Outputs { - modOutputs = append(modOutputs, mod.Outputs[idx].Name) - } - kubectlConf := Dict{} for s, v := range map[string]string{ "cluster_ca_certificate": "cluster_ca_certificate", "host": "host_endpoint", "token": "access_token"} { - if slices.Contains(modOutputs, v) { - kubectlConf = kubectlConf.With(s, ModuleRef(mod.ID, v).AsValue()) - } + kubectlConf = kubectlConf.With(s, ModuleRef(mod.ID, v).AsValue()) } kubectlConf = kubectlConf. With("apply_retry_count", cty.NumberIntVal(15)). diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 625b8c2c15..f9f273efd8 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -87,37 +87,39 @@ func (s *zeroSuite) TestExpandProviders(c *C) { With("zone", cty.StringVal("zone1")). With("universe_domain", cty.StringVal("test-universe.com"))}} - defaultProvider := map[string]PR{ - "google": TerraformProvider{ - Source: "hashicorp/google", - Version: "~> 6.13.0"}, - "google-beta": TerraformProvider{ - Source: "hashicorp/google-beta", - Version: "~> 6.13.0"}} - testGKEClusterModuleID := ModuleID("dummy_cluster") - testGKEClusterModuleOutputName := "host_endpoint" - kubectlProvider := PR{ - Source: "gavinbunney/kubectl", - Version: ">= 1.7.0", - Configuration: Dict{}. - With("host", ModuleRef(testGKEClusterModuleID, testGKEClusterModuleOutputName).AsValue()). - With("apply_retry_count", cty.NumberIntVal(15)). 
- With("load_config_file", cty.BoolVal(false))} + testKubectlConf := Dict{} + for s, v := range map[string]string{ + "cluster_ca_certificate": "cluster_ca_certificate", + "host": "host_endpoint", + "token": "access_token"} { + testKubectlConf = testKubectlConf.With(s, ModuleRef(testGKEClusterModuleID, v).AsValue()) + } + testKubectlConf = testKubectlConf. + With("apply_retry_count", cty.NumberIntVal(15)). + With("load_config_file", cty.BoolVal(false)) - testGKEClusterModuleOutputs := []modulereader.OutputInfo{ - {Name: testGKEClusterModuleOutputName}} + testKubectlProvider := PR{ + Source: "gavinbunney/kubectl", + Version: ">= 1.7.0", + Configuration: testKubectlConf} testGKEClusterModule := Module{ - Source: "module/test/gke-cluster/dummy", - ID: testGKEClusterModuleID, - Outputs: testGKEClusterModuleOutputs} + Source: "module/test/gke-cluster/dummy", + ID: testGKEClusterModuleID} testPreExistingGKEClusterModule := Module{ - Source: "module/test/pre-existing-gke-cluster/dummy", - ID: testGKEClusterModuleID, - Outputs: testGKEClusterModuleOutputs} + Source: "module/test/pre-existing-gke-cluster/dummy", + ID: testGKEClusterModuleID} + + defaultProvider := map[string]PR{ + "google": TerraformProvider{ + Source: "hashicorp/google", + Version: "~> 6.13.0"}, + "google-beta": TerraformProvider{ + Source: "hashicorp/google-beta", + Version: "~> 6.13.0"}} { // no def PR, no group PR - match default values g := Group{Name: "clown"} @@ -129,7 +131,7 @@ func (s *zeroSuite) TestExpandProviders(c *C) { g := Group{ Name: "clown", Modules: []Module{testGKEClusterModule}} - defaultProvider["kubectl"] = kubectlProvider + defaultProvider["kubectl"] = testKubectlProvider noDefPr.expandProviders(&g) c.Check(g.TerraformProviders, DeepEquals, defaultProvider) delete(defaultProvider, "kubectl") @@ -139,7 +141,7 @@ func (s *zeroSuite) TestExpandProviders(c *C) { g := Group{ Name: "clown", Modules: []Module{testPreExistingGKEClusterModule}} - defaultProvider["kubectl"] = kubectlProvider + defaultProvider["kubectl"] = testKubectlProvider noDefPr.expandProviders(&g) c.Check(g.TerraformProviders, DeepEquals, defaultProvider) delete(defaultProvider, "kubectl") From 3361ed5f682eb2001fe3ef9bc59497b42bfd2544 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Mon, 16 Dec 2024 14:10:50 +0000 Subject: [PATCH 030/140] Update README with GKE parallelstore related example blueprint details --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index 95cea23dae..b2a7bd7b3c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1527,7 +1527,7 @@ The blueprint contains the following: * A K8s Job that uses a parallelstore storage volume option. * A K8s Job that demonstrates ML training workload with parallelstore storage disk ops. -> **Warning**: In this example, when storage type `Parallelstore` is specified in `gke-storage` module. +> **Warning**: In this example blueprint, when storage type `Parallelstore` is specified in `gke-storage` module. > The lifecycle of the parallelstore is managed by the blueprint. > On glcuster destroy ops, the Parallelstore storage created will also be destroyed. 
> From 1f1f978f68308be5b838c3c440b99df7bcdd17b8 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Mon, 16 Dec 2024 22:10:41 +0000 Subject: [PATCH 031/140] Move gke a3u blueprints to develop to enable integration testing --- examples/gke-a3-ultragpu/README.md | 1 + .../gke-a3-ultragpu-deployment.yaml | 16 ++ examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml | 212 ++++++++++++++++++ examples/gke-a3-ultragpu/mglru-disable.yaml | 59 +++++ examples/gke-a3-ultragpu/nccl-installer.yaml | 80 +++++++ .../gke-a3-ultragpu/nccl-jobset-example.yaml | 208 +++++++++++++++++ .../gke-a3-ultragpu/nccl-test-32-node.yaml | 208 +++++++++++++++++ examples/gke-a3-ultragpu/nccl-test.yaml | 149 ++++++++++++ 8 files changed, 933 insertions(+) create mode 100644 examples/gke-a3-ultragpu/README.md create mode 100644 examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml create mode 100644 examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml create mode 100644 examples/gke-a3-ultragpu/mglru-disable.yaml create mode 100644 examples/gke-a3-ultragpu/nccl-installer.yaml create mode 100644 examples/gke-a3-ultragpu/nccl-jobset-example.yaml create mode 100644 examples/gke-a3-ultragpu/nccl-test-32-node.yaml create mode 100644 examples/gke-a3-ultragpu/nccl-test.yaml diff --git a/examples/gke-a3-ultragpu/README.md b/examples/gke-a3-ultragpu/README.md new file mode 100644 index 0000000000..73b37bbfcb --- /dev/null +++ b/examples/gke-a3-ultragpu/README.md @@ -0,0 +1 @@ +Refer to [AI Hypercomputer Documentation](https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute#create-cluster) for instructions. \ No newline at end of file diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml new file mode 100644 index 0000000000..b7a8d24071 --- /dev/null +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml @@ -0,0 +1,16 @@ +--- + terraform_backend_defaults: + type: gcs + configuration: + bucket: BUCKET_NAME + + vars: + deployment_name: gke-a3-ultra + project_id: PROJECT_ID + region: COMPUTE_REGION + zone: COMPUTE_ZONE + authorized_cidr: / + # In order to not target a BLOCK_NAME, extended_reservation can be inputed as + # extended_reservation: RESERVATION_NAME + extended_reservation: RESERVATION_NAME/reservationBlocks/BLOCK_NAME + static_node_count: NODE_COUNT diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml new file mode 100644 index 0000000000..7069b90797 --- /dev/null +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml @@ -0,0 +1,212 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +blueprint_name: gke-a3-ultra + +vars: + project_id: # add this + deployment_name: # add this + region: # add this + zone: # add this + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. 
+ authorized_cidr: # add this + extended_reservation: # add this + # Installs NCCL library and Google NCCL plugin + # Runs an init container on all H200 GPU nodes with the NCCL plugin image + nccl_installer_path: $(ghpc_stage("./nccl-installer.yaml")) + # Temporary fix for COS issue, will be fixed in next release + mglru_disable_path: $(ghpc_stage("./mglru-disable.yaml")) + mtu_size: 8896 + static_node_count: # add this + system_node_pool_disk_size_gb: 200 + a3ultra_node_pool_disk_size_gb: 100 + +deployment_groups: +- group: primary + modules: + - id: gke-a3-ultra-net-0 + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/vpc?ref=e0c690b + settings: + network_name: gke-a3-ultra-net-0 + subnetworks: + - subnet_name: gke-a3-ultra-sub-0 + subnet_region: $(vars.region) + subnet_ip: 192.168.0.0/18 + secondary_ranges: + gke-a3-ultra-sub-0: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + firewall_rules: + - name: gke-a3-ultra-internal-0 + ranges: [192.168.0.0/16] + allow: + - protocol: tcp + ports: ["0-65535"] + - protocol: udp + ports: ["0-65535"] + - protocol: icmp + + - id: gke-a3-ultra-net-1 + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/vpc?ref=e0c690b + settings: + network_name: gke-a3-ultra-net-1 + mtu: $(vars.mtu_size) + subnetworks: + - subnet_name: gke-a3-ultra-sub-1 + subnet_region: $(vars.region) + subnet_ip: 192.168.64.0/18 + firewall_rules: + - name: gke-a3-ultra-internal-1 + ranges: [192.168.0.0/16] + allow: + - protocol: tcp + ports: ["0-65535"] + - protocol: udp + ports: ["0-65535"] + - protocol: icmp + + - id: gke-a3-ultra-rdma-net + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc?ref=98c49fe + settings: + network_name: gke-a3-ultra-rdma-net + mtu: $(vars.mtu_size) + network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce + network_routing_mode: REGIONAL + subnetworks_template: + name_prefix: gke-a3-ultra-rdma-sub + count: 8 + ip_range: 192.168.128.0/18 + region: $(vars.region) + + - id: a3-ultragpu-cluster + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster?ref=e0c690b + use: [gke-a3-ultra-net-0] + settings: + release_channel: RAPID + system_node_pool_machine_type: "e2-standard-16" + system_node_pool_disk_size_gb: $(vars.system_node_pool_disk_size_gb) + system_node_pool_taints: [] + enable_dcgm_monitoring: true + enable_gcsfuse_csi: true + enable_private_endpoint: false # Allows access from authorized public IPs + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine to run the kubectl command. Required for multi network setup. 
+ display_name: "kubectl-access-network" + maintenance_exclusions: + - name: no-minor-or-node-upgrades-indefinite + start_time: "2024-12-01T00:00:00Z" + end_time: "2025-12-22T00:00:00Z" + exclusion_scope: NO_MINOR_OR_NODE_UPGRADES + additional_networks: + $(concat( + [{ + network=gke-a3-ultra-net-1.network_name, + subnetwork=gke-a3-ultra-net-1.subnetwork_name, + subnetwork_project=vars.project_id, + nic_type="GVNIC", + queue_count=null, + network_ip=null, + stack_type=null, + access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], + ipv6_access_config=[], + alias_ip_range=[] + }], + gke-a3-ultra-rdma-net.subnetwork_interfaces_gke + )) + outputs: [instructions] + + - id: a3-ultragpu-pool + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool?ref=e0c690b + use: [a3-ultragpu-cluster] + settings: + machine_type: a3-ultragpu-8g + auto_upgrade: true + zones: [$(vars.zone)] + disk_type: hyperdisk-balanced + disk_size_gb: $(vars.a3ultra_node_pool_disk_size_gb) + static_node_count: $(vars.static_node_count) + guest_accelerator: + - type: nvidia-h200-141gb + count: 8 + gpu_driver_installation_config: + gpu_driver_version: "LATEST" + reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: $(vars.extended_reservation) + additional_networks: + $(concat( + [{ + network=gke-a3-ultra-net-1.network_name, + subnetwork=gke-a3-ultra-net-1.subnetwork_name, + subnetwork_project=vars.project_id, + nic_type="GVNIC", + queue_count=null, + network_ip=null, + stack_type=null, + access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], + ipv6_access_config=[], + alias_ip_range=[] + }], + gke-a3-ultra-rdma-net.subnetwork_interfaces_gke + )) + outputs: [instructions] + + - id: topology-aware-scheduler-install + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler?ref=e0c690b + use: [a3-ultragpu-cluster] + + - id: workload-manager-install + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=e0c690b + use: [a3-ultragpu-cluster] + settings: + kueue: + install: true + version: v0.9.1 + jobset: + install: true + version: v0.7.1 + apply_manifests: + - source: $(vars.nccl_installer_path) + - source: $(vars.mglru_disable_path) + + - id: job-template + source: modules/compute/gke-job-template + use: [a3-ultragpu-pool] + settings: + image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 + command: + - nvidia-smi + node_count: 2 + name: run-nvidia-smi + outputs: [instructions] + +terraform_providers: + google: + source: hashicorp/google + version: 6.13.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + google-beta: + source: hashicorp/google-beta + version: 6.13.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) diff --git a/examples/gke-a3-ultragpu/mglru-disable.yaml b/examples/gke-a3-ultragpu/mglru-disable.yaml new file mode 100644 index 0000000000..f0bc1c8caf --- /dev/null +++ b/examples/gke-a3-ultragpu/mglru-disable.yaml @@ -0,0 +1,59 @@ +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: disable-mglru + namespace: kube-system +spec: + selector: + matchLabels: + app: disable-mglru + template: + metadata: + labels: + app: disable-mglru + spec: + hostNetwork: true + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + containers: + - name: disable-mglru + image: alpine:latest + command: ["/bin/sh"] + securityContext: + privileged: true + args: + - -c + - | + echo n | tee /sys/kernel/mm/lru_gen/enabled + sysctl -w net.ipv4.conf.eth2.log_martians=0 + sysctl -w net.ipv4.conf.eth3.log_martians=0 + sysctl -w net.ipv4.conf.eth4.log_martians=0 + sysctl -w net.ipv4.conf.eth5.log_martians=0 + sysctl -w net.ipv4.conf.eth6.log_martians=0 + sysctl -w net.ipv4.conf.eth7.log_martians=0 + sysctl -w net.ipv4.conf.eth8.log_martians=0 + sysctl -w net.ipv4.conf.eth9.log_martians=0 + sleep infinity + volumeMounts: + - name: sys-kernel-mm-lru-gen + mountPath: /sys/kernel/mm/lru_gen + # Remount sysfs so that it will be writable. + volumes: + - name: sys-kernel-mm-lru-gen + hostPath: + path: /sys/kernel/mm/lru_gen diff --git a/examples/gke-a3-ultragpu/nccl-installer.yaml b/examples/gke-a3-ultragpu/nccl-installer.yaml new file mode 100644 index 0000000000..f2239b2584 --- /dev/null +++ b/examples/gke-a3-ultragpu/nccl-installer.yaml @@ -0,0 +1,80 @@ +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
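# Summary of the DaemonSet below: it schedules onto GKE nodes whose
# cloud.google.com/gke-accelerator label is nvidia-h200-141gb, runs a privileged
# init container from the nccl-plugin-gib image that installs NCCL and the gIB
# NCCL plugin, copies the resulting libraries onto the host under
# /home/kubernetes/bin/nvidia/lib64 and /home/kubernetes/bin/gib, verifies RDMA
# device visibility with ibv_devinfo, and then parks on a pause container.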
+ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nccl-rdma-installer + namespace: kube-system + labels: + k8s-app: nccl-rdma-installer +spec: + selector: + matchLabels: + k8s-app: nccl-rdma-installer + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nccl-rdma-installer + k8s-app: nccl-rdma-installer + spec: + priorityClassName: system-node-critical + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: In + values: + - nvidia-h200-141gb + tolerations: + - operator: "Exists" + hostNetwork: true + hostPID: true + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia/lib64 + type: DirectoryOrCreate + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + initContainers: + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + name: nccl-rdma-installer + resources: + requests: + cpu: 150m + securityContext: + privileged: true + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64 + - name: gib + mountPath: /usr/local/home/kubernetes/bin/gib + command: ["/bin/sh", "-c"] + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64 + cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib + ibv_devinfo || exit 1 + echo "installation finishes" + containers: + - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830" + name: pause diff --git a/examples/gke-a3-ultragpu/nccl-jobset-example.yaml b/examples/gke-a3-ultragpu/nccl-jobset-example.yaml new file mode 100644 index 0000000000..da49668d0a --- /dev/null +++ b/examples/gke-a3-ultragpu/nccl-jobset-example.yaml @@ -0,0 +1,208 @@ +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + generateName: ag-4- + namespace: default +spec: + ttlSecondsAfterFinished: 1200 + suspend: False + network: + enableDNSHostnames: true + replicatedJobs: + - name: w + template: + spec: + parallelism: 4 + completions: 4 + + template: + metadata: + annotations: + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gke-a3-ultra-sub-1"}, + {"interfaceName":"eth2","network":"gke-a3-ultra-rdma-sub-0"}, + {"interfaceName":"eth3","network":"gke-a3-ultra-rdma-sub-1"}, + {"interfaceName":"eth4","network":"gke-a3-ultra-rdma-sub-2"}, + {"interfaceName":"eth5","network":"gke-a3-ultra-rdma-sub-3"}, + {"interfaceName":"eth6","network":"gke-a3-ultra-rdma-sub-4"}, + {"interfaceName":"eth7","network":"gke-a3-ultra-rdma-sub-5"}, + {"interfaceName":"eth8","network":"gke-a3-ultra-rdma-sub-6"}, + {"interfaceName":"eth9","network":"gke-a3-ultra-rdma-sub-7"} + ] + spec: + # Limit benchmark run duration + activeDeadlineSeconds: 3600 + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-nodepool: a3-ultragpu-8g-a3-ultragpu-pool + tolerations: + - key: cloud.google.com/gke-queued + effect: NoSchedule + value: "true" + + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + + setHostnameAsFQDN: true + volumes: + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: nvidia + hostPath: + path: /home/kubernetes/bin/nvidia + - name: lib64 + hostPath: + path: /lib64 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + - name: 
sys + hostPath: + path: /sys + - name: proc-sys + hostPath: + path: /proc/sys + schedulingGates: + # Set this to a unique name per job. + - name: "gke.io/topology-aware-auto-ag-4" + + initContainers: + - name: gpu-healthcheck + image: alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + apk add --no-cache bash # Install bash + /bin/bash -c "set -ex + NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) + if [ \${NUM_GPUS} -lt 8 ]; then + echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" + exit 1 + fi + gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) + for gpu_index in \${!gpu_errors[@]}; do + if [ \${gpu_errors[\$gpu_index]} == '[N/A]' ]; then + echo 'Error: ERR detected in GPU index '\$gpu_index + exit 1 + elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then + echo 'Error: Unrecoverable ECC errors detected in GPU index '\$gpu_index + exit 1 + fi + done + echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" + + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: lib64 + mountPath: /lib64 + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + + containers: + - name: nccl + stdin: true + tty: true + image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + securityContext: + privileged: true + env: + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: OMPI_ALLOW_RUN_AS_ROOT + value: "1" + - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM + value: "1" + command: + - bash + - -c + - | + set -x + export N_NODES=4 + echo "Starting workload container on ${MY_NODE_NAME} for $N_NODES benchmark" + + # Load all the cuda libs + /sbin/ldconfig + + # Install ping + apt update -y + apt install -y iputils-ping + + # Start sshd + /scripts/container_entry.sh daemon & + + # Get helper variables to form all hostnames + export POSTFIX=$(hostname | cut -d . -f 2-) + export WORKERS_BASENAME=$(hostname | cut -d . -f 1 | rev | cut -d - -f 2- | rev ) + export NODE_RANK=$JOB_COMPLETION_INDEX + + + # For every worker, wait till online and add to hostfile + for i in `seq 0 $(($N_NODES-1))`; do + OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} + until ssh -p 222 -o StrictHostKeyChecking=no $OTHER hostname; do + echo Waiting for ${OTHER}... 
+ sleep 10 + done + echo ${OTHER} port=222 slots=8 | tee -a /tmp/hostfile; + done + + cat /tmp/hostfile + + # Launch from head node + if [[ "${NODE_RANK}" -eq "0" ]]; then + + # World Level = 0x0, Rail Aligned = 0x7 + export NCCL_TESTS_SPLIT_MASK="0x0"; + + # Force use of libnccl-gib + export NCCL_NET=gIB + + # Set all the correct libnccl-gib environment variables + source /usr/local/gib/scripts/set_nccl_env.sh + + # Get all relevant NCCL / env vars to pass to all workers + ENV_VARS=$(echo ${!NCCL*} ${!OMPI*} LD_LIBRARY_PATH PATH | sed 's/ / -x /g') + + mpirun --hostfile /tmp/hostfile \ + -x $ENV_VARS \ + -mca plm_rsh_no_tree_spawn 1 \ + --mca orte_keep_fqdn_hostnames 1 \ + --mca btl self,tcp \ + --mca btl_tcp_if_include eth0 \ + --bind-to none \ + --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p 222" \ + /third_party/nccl-tests/build/all_gather_perf -b 1K -e 8G -f 2 -g 1 -w 5 --iters 100 -c 1 + + else + while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do + sleep 5 + done + fi + + exit 0 + + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 + restartPolicy: Never diff --git a/examples/gke-a3-ultragpu/nccl-test-32-node.yaml b/examples/gke-a3-ultragpu/nccl-test-32-node.yaml new file mode 100644 index 0000000000..3ce2b490d6 --- /dev/null +++ b/examples/gke-a3-ultragpu/nccl-test-32-node.yaml @@ -0,0 +1,208 @@ +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + generateName: ag-32- + namespace: default +spec: + ttlSecondsAfterFinished: 1200 + suspend: False + network: + enableDNSHostnames: true + replicatedJobs: + - name: w + template: + spec: + parallelism: 32 + completions: 32 + + template: + metadata: + annotations: + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gke-a3-ultra-sub-1"}, + {"interfaceName":"eth2","network":"gke-a3-ultra-rdma-sub-0"}, + {"interfaceName":"eth3","network":"gke-a3-ultra-rdma-sub-1"}, + {"interfaceName":"eth4","network":"gke-a3-ultra-rdma-sub-2"}, + {"interfaceName":"eth5","network":"gke-a3-ultra-rdma-sub-3"}, + {"interfaceName":"eth6","network":"gke-a3-ultra-rdma-sub-4"}, + {"interfaceName":"eth7","network":"gke-a3-ultra-rdma-sub-5"}, + {"interfaceName":"eth8","network":"gke-a3-ultra-rdma-sub-6"}, + {"interfaceName":"eth9","network":"gke-a3-ultra-rdma-sub-7"} + ] + spec: + # Limit benchmark run duration + activeDeadlineSeconds: 3600 + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-nodepool: a3-ultragpu-8g-a3-ultragpu-pool + tolerations: + - key: cloud.google.com/gke-queued + effect: NoSchedule + value: "true" + + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + + setHostnameAsFQDN: true + volumes: + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: nvidia + hostPath: + path: /home/kubernetes/bin/nvidia + - name: lib64 + hostPath: + path: /lib64 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + - name: sys + hostPath: + path: /sys + - name: proc-sys + hostPath: + path: /proc/sys + schedulingGates: + # Set this to a unique name per job. 
+ - name: "gke.io/topology-aware-auto-ag-32" + + initContainers: + - name: gpu-healthcheck + image: alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + apk add --no-cache bash # Install bash + /bin/bash -c "set -ex + NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) + if [ \${NUM_GPUS} -lt 8 ]; then + echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" + exit 1 + fi + gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) + for gpu_index in \${!gpu_errors[@]}; do + if [ \${gpu_errors[\$gpu_index]} == '[N/A]' ]; then + echo 'Error: ERR detected in GPU index '\$gpu_index + exit 1 + elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then + echo 'Error: Unrecoverable ECC errors detected in GPU index '\$gpu_index + exit 1 + fi + done + echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" + + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: lib64 + mountPath: /lib64 + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + + containers: + - name: nccl + stdin: true + tty: true + image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + securityContext: + privileged: true + env: + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: OMPI_ALLOW_RUN_AS_ROOT + value: "1" + - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM + value: "1" + command: + - bash + - -c + - | + set -x + export N_NODES=32 + echo "Starting workload container on ${MY_NODE_NAME} for $N_NODES benchmark" + + # Load all the cuda libs + /sbin/ldconfig + + # Install ping + apt update -y + apt install -y iputils-ping + + # Start sshd + /scripts/container_entry.sh daemon & + + # Get helper variables to form all hostnames + export POSTFIX=$(hostname | cut -d . -f 2-) + export WORKERS_BASENAME=$(hostname | cut -d . -f 1 | rev | cut -d - -f 2- | rev ) + export NODE_RANK=$JOB_COMPLETION_INDEX + + + # For every worker, wait till online and add to hostfile + for i in `seq 0 $(($N_NODES-1))`; do + OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} + until ssh -p 222 -o StrictHostKeyChecking=no $OTHER hostname; do + echo Waiting for ${OTHER}... 
+ sleep 10 + done + echo ${OTHER} port=222 slots=8 | tee -a /tmp/hostfile; + done + + cat /tmp/hostfile + + # Launch from head node + if [[ "${NODE_RANK}" -eq "0" ]]; then + + # World Level = 0x0, Rail Aligned = 0x7 + export NCCL_TESTS_SPLIT_MASK="0x0"; + + # Force use of libnccl-gib + export NCCL_NET=gIB + + # Set all the correct libnccl-gib environment variables + source /usr/local/gib/scripts/set_nccl_env.sh + + # Get all relevant NCCL / env vars to pass to all workers + ENV_VARS=$(echo ${!NCCL*} ${!OMPI*} LD_LIBRARY_PATH PATH | sed 's/ / -x /g') + + mpirun --hostfile /tmp/hostfile \ + -x $ENV_VARS \ + -mca plm_rsh_no_tree_spawn 1 \ + --mca orte_keep_fqdn_hostnames 1 \ + --mca btl self,tcp \ + --mca btl_tcp_if_include eth0 \ + --bind-to none \ + --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p 222" \ + /third_party/nccl-tests/build/all_gather_perf -b 1K -e 8G -f 2 -g 1 -w 5 --iters 100 -c 1 + + else + while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do + sleep 5 + done + fi + + exit 0 + + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 + restartPolicy: Never diff --git a/examples/gke-a3-ultragpu/nccl-test.yaml b/examples/gke-a3-ultragpu/nccl-test.yaml new file mode 100644 index 0000000000..994601472f --- /dev/null +++ b/examples/gke-a3-ultragpu/nccl-test.yaml @@ -0,0 +1,149 @@ +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: nccl-host-1 +spec: + selector: + name: nccl-host-1 + clusterIP: None +--- +apiVersion: v1 +kind: Service +metadata: + name: nccl-host-2 +spec: + selector: + name: nccl-host-2 + clusterIP: None +--- +apiVersion: v1 +kind: Pod +metadata: + name: nccl-test-host-1 + labels: + name: nccl-host-1 + annotations: + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gke-a3-ultra-sub-1"}, + {"interfaceName":"eth2","network":"gke-a3-ultra-rdma-sub-0"}, + {"interfaceName":"eth3","network":"gke-a3-ultra-rdma-sub-1"}, + {"interfaceName":"eth4","network":"gke-a3-ultra-rdma-sub-2"}, + {"interfaceName":"eth5","network":"gke-a3-ultra-rdma-sub-3"}, + {"interfaceName":"eth6","network":"gke-a3-ultra-rdma-sub-4"}, + {"interfaceName":"eth7","network":"gke-a3-ultra-rdma-sub-5"}, + {"interfaceName":"eth8","network":"gke-a3-ultra-rdma-sub-6"}, + {"interfaceName":"eth9","network":"gke-a3-ultra-rdma-sub-7"} + ] +spec: + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + containers: + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + name: test + resources: + requests: + cpu: 150m + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + resources: + limits: + nvidia.com/gpu: 8 + command: ["/bin/bash", "-c"] + args: + - | + /scripts/container_entry.sh shell + source /usr/local/gib/scripts/set_nccl_env.sh + sleep infinity +--- +apiVersion: v1 +kind: Pod +metadata: + name: nccl-test-host-2 + labels: + name: nccl-host-2 + annotations: + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gke-a3-ultra-sub-1"}, + {"interfaceName":"eth2","network":"gke-a3-ultra-rdma-sub-0"}, + {"interfaceName":"eth3","network":"gke-a3-ultra-rdma-sub-1"}, + {"interfaceName":"eth4","network":"gke-a3-ultra-rdma-sub-2"}, + {"interfaceName":"eth5","network":"gke-a3-ultra-rdma-sub-3"}, + {"interfaceName":"eth6","network":"gke-a3-ultra-rdma-sub-4"}, + {"interfaceName":"eth7","network":"gke-a3-ultra-rdma-sub-5"}, + {"interfaceName":"eth8","network":"gke-a3-ultra-rdma-sub-6"}, + {"interfaceName":"eth9","network":"gke-a3-ultra-rdma-sub-7"} + ] +spec: + volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + containers: + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + name: test + resources: + requests: + cpu: 150m + volumeMounts: + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + resources: + limits: + nvidia.com/gpu: 8 + command: ["/bin/bash", "-c"] + args: + - | + /scripts/container_entry.sh shell + source /usr/local/gib/scripts/set_nccl_env.sh + sleep infinity From ab6bc3b0c404d5773b62c6e8157d29ea2d7a6a11 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: 
Tue, 17 Dec 2024 06:33:34 +0000 Subject: [PATCH 032/140] Updated blueprint name from gke-storage-parallelstore to gke-storage-managed-parallelstore --- examples/README.md | 10 +++++----- ...ore.yaml => gke-storage-managed-parallelstore.yaml} | 4 ++-- modules/file-system/gke-storage/README.md | 2 +- ...ore.yaml => gke-storage-managed-parallelstore.yaml} | 6 +++--- ...store.yml => gke-storage-managed-parallelstore.yml} | 8 ++++---- 5 files changed, 15 insertions(+), 15 deletions(-) rename examples/{gke-storage-parallelstore.yaml => gke-storage-managed-parallelstore.yaml} (98%) rename tools/cloud-build/daily-tests/builds/{gke-storage-parallelstore.yaml => gke-storage-managed-parallelstore.yaml} (93%) rename tools/cloud-build/daily-tests/tests/{gke-storage-parallelstore.yml => gke-storage-managed-parallelstore.yml} (77%) diff --git a/examples/README.md b/examples/README.md index b2a7bd7b3c..73272df3cb 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1518,14 +1518,14 @@ cleaned up when the job is deleted. [storage-gke.yaml]: ../examples/storage-gke.yaml -### [gke-storage-parallelstore.yaml] ![core-badge] ![experimental-badge] +### [gke-storage-managed-parallelstore.yaml] ![core-badge] ![experimental-badge] -This blueprint shows how to use parallelstore storage options with GKE in the toolkit. +This blueprint shows how to use managed parallelstore storage options with GKE in the toolkit. The blueprint contains the following: -* A K8s Job that uses a parallelstore storage volume option. -* A K8s Job that demonstrates ML training workload with parallelstore storage disk ops. +* A K8s Job that uses a managed parallelstore storage volume option. +* A K8s Job that demonstrates ML training workload with managed parallelstore storage disk ops. > **Warning**: In this example blueprint, when storage type `Parallelstore` is specified in `gke-storage` module. > The lifecycle of the parallelstore is managed by the blueprint. @@ -1540,7 +1540,7 @@ The blueprint contains the following: > `--vars authorized_cidr=/32`.** You can use a service like > [whatismyip.com](https://whatismyip.com) to determine your IP address. -[gke-storage-parallelstore.yaml]: ../examples/gke-storage-parallelstore.yaml +[gke-storage-managed-parallelstore.yaml]: ../examples/gke-storage-managed-parallelstore.yaml ### [gke-a3-megagpu.yaml] ![core-badge] ![experimental-badge] diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-managed-parallelstore.yaml similarity index 98% rename from examples/gke-storage-parallelstore.yaml rename to examples/gke-storage-managed-parallelstore.yaml index ac8f5773b9..414a2b180d 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-managed-parallelstore.yaml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -blueprint_name: gke-storage-parallelstore +blueprint_name: gke-storage-managed-parallelstore vars: project_id: ## Set GCP Project ID Here ## - deployment_name: gke-storage-ps + deployment_name: gke-storage-managed-ps region: us-central1 zone: us-central1-c # Cidr block containing the IP of the machine calling terraform. diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index 17c718aa37..f4ebd8add0 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -39,7 +39,7 @@ then use them in a `gke-job-template` to dynamically provision the resource. 
``` See example -[gke-storage-parallelstore.yaml](../../../examples/README.md#gke-storage-parallelstoreyaml--) blueprint +[gke-storage-managed-parallelstore.yaml](../../../examples/README.md#gke-storage-managed-parallelstoreyaml--) blueprint for a complete example. ### Authorized Network diff --git a/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml b/tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml similarity index 93% rename from tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml rename to tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml index a51c8cebab..8fbc9c1794 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml @@ -27,7 +27,7 @@ timeout: 14400s # 4hr steps: ## Test GKE -- id: gke-storage-parallelstore +- id: gke-storage-managed-parallelstore name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: @@ -40,7 +40,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=examples/gke-storage-parallelstore.yaml + SG_EXAMPLE=examples/gke-storage-managed-parallelstore.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} @@ -58,4 +58,4 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml" + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml similarity index 77% rename from tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml rename to tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml index a6de4bf239..bfb8bc32d7 100644 --- a/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml +++ b/tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -test_name: gke-storage-parallelstore -deployment_name: gke-storage-parallelstore-{{ build }} +test_name: gke-storage-managed-parallelstore +deployment_name: gke-storage-managed-parallelstore-{{ build }} zone: us-central1-a # for remote node region: us-central1 workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/gke-storage-parallelstore.yaml" +blueprint_yaml: "{{ workspace }}/examples/gke-storage-managed-parallelstore.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: -- test-validation/test-gke-storage-parallelstore.yml +- test-validation/test-gke-storage-managed-parallelstore.yml custom_vars: project: "{{ project }}" cli_deployment_vars: From 270ccb756299a9f01c1a3c8b9829c6195dd368d4 Mon Sep 17 00:00:00 2001 From: ighosh98 Date: Tue, 17 Dec 2024 09:38:42 +0000 Subject: [PATCH 033/140] integrating kueue v0.10.0 to enable TAS with rank ordering support --- .../manifests/kueue-v0.10.0.yaml | 13184 ++++++++++++++++ modules/management/kubectl-apply/variables.tf | 2 +- .../blueprints/gke-a2-highgpu.yaml | 2 +- 3 files changed, 13186 insertions(+), 2 deletions(-) create mode 100644 modules/management/kubectl-apply/manifests/kueue-v0.10.0.yaml diff --git a/modules/management/kubectl-apply/manifests/kueue-v0.10.0.yaml b/modules/management/kubectl-apply/manifests/kueue-v0.10.0.yaml new file mode 100644 index 0000000000..696e9b1ffb --- /dev/null +++ b/modules/management/kubectl-apply/manifests/kueue-v0.10.0.yaml @@ -0,0 +1,13184 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Namespace +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-system +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: admissionchecks.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: AdmissionCheck + listKind: AdmissionCheckList + plural: admissionchecks + singular: admissioncheck + scope: Cluster + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: AdmissionCheck is the Schema for the admissionchecks API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: AdmissionCheckSpec defines the desired state of AdmissionCheck + properties: + controllerName: + description: |- + controllerName identifies the controller that processes the AdmissionCheck, + not necessarily a Kubernetes Pod or Deployment name. Cannot be empty. + type: string + x-kubernetes-validations: + - message: field is immutable + rule: self == oldSelf + parameters: + description: |- + Parameters identifies a configuration with additional parameters for the + check. + properties: + apiGroup: + description: ApiGroup is the group for the resource being referenced. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + description: Kind is the type of the resource being referenced. + maxLength: 63 + pattern: ^(?i)[a-z]([-a-z0-9]*[a-z0-9])?$ + type: string + name: + description: Name is the name of the resource being referenced. + maxLength: 63 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + required: + - apiGroup + - kind + - name + type: object + retryDelayMinutes: + default: 15 + description: |- + RetryDelayMinutes specifies how long to keep the workload suspended after + a failed check (after it transitioned to False). When the delay period has passed, the check + state goes to "Unknown". The default is 15 min. + Deprecated: retryDelayMinutes has already been deprecated since v0.8 and will be removed in v1beta2. + format: int64 + type: integer + required: + - controllerName + type: object + status: + description: AdmissionCheckStatus defines the observed state of AdmissionCheck + properties: + conditions: + description: |- + conditions hold the latest available observations of the AdmissionCheck + current state. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: clusterqueues.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: ClusterQueue + listKind: ClusterQueueList + plural: clusterqueues + shortNames: + - cq + singular: clusterqueue + scope: Cluster + versions: + - additionalPrinterColumns: + - description: Cohort that this ClusterQueue belongs to + jsonPath: .spec.cohort + name: Cohort + type: string + - description: The queueing strategy used to prioritize workloads + jsonPath: .spec.queueingStrategy + name: Strategy + priority: 1 + type: string + - description: Number of pending workloads + jsonPath: .status.pendingWorkloads + name: Pending Workloads + type: integer + - description: Number of admitted workloads that haven't finished yet + jsonPath: .status.admittedWorkloads + name: Admitted Workloads + priority: 1 + type: integer + name: v1beta1 + schema: + openAPIV3Schema: + description: ClusterQueue is the Schema for the clusterQueue API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ClusterQueueSpec defines the desired state of ClusterQueue + properties: + admissionChecks: + description: |- + admissionChecks lists the AdmissionChecks required by this ClusterQueue. + Cannot be used along with AdmissionCheckStrategy. + items: + type: string + type: array + admissionChecksStrategy: + description: |- + admissionCheckStrategy defines a list of strategies to determine which ResourceFlavors require AdmissionChecks. + This property cannot be used in conjunction with the 'admissionChecks' property. + properties: + admissionChecks: + description: admissionChecks is a list of strategies for AdmissionChecks + items: + description: AdmissionCheckStrategyRule defines rules for a + single AdmissionCheck + properties: + name: + description: name is an AdmissionCheck's name. + type: string + onFlavors: + description: |- + onFlavors is a list of ResourceFlavors' names that this AdmissionCheck should run for. + If empty, the AdmissionCheck will run for all workloads submitted to the ClusterQueue. 
+ items: + description: ResourceFlavorReference is the name of the + ResourceFlavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + type: array + required: + - name + type: object + type: array + type: object + cohort: + description: |- + cohort that this ClusterQueue belongs to. CQs that belong to the + same cohort can borrow unused resources from each other. + + A CQ can be a member of a single borrowing cohort. A workload submitted + to a queue referencing this CQ can borrow quota from any CQ in the cohort. + Only quota for the [resource, flavor] pairs listed in the CQ can be + borrowed. + If empty, this ClusterQueue cannot borrow from any other ClusterQueue and + vice versa. + + A cohort is a name that links CQs together, but it doesn't reference any + object. + + Validation of a cohort name is equivalent to that of object names: + subdomain in DNS (RFC 1123). + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + fairSharing: + description: |- + fairSharing defines the properties of the ClusterQueue when participating in fair sharing. + The values are only relevant if fair sharing is enabled in the Kueue configuration. + properties: + weight: + anyOf: + - type: integer + - type: string + default: 1 + description: |- + weight gives a comparative advantage to this ClusterQueue when competing for unused + resources in the cohort against other ClusterQueues. + The share of a ClusterQueue is based on the dominant resource usage above nominal + quotas for each resource, divided by the weight. + Admission prioritizes scheduling workloads from ClusterQueues with the lowest share + and preempting workloads from the ClusterQueues with the highest share. + A zero weight implies infinite share value, meaning that this ClusterQueue will always + be at disadvantage against other ClusterQueues. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + flavorFungibility: + default: {} + description: |- + flavorFungibility defines whether a workload should try the next flavor + before borrowing or preempting in the flavor being evaluated. + properties: + whenCanBorrow: + default: Borrow + description: |- + whenCanBorrow determines whether a workload should try the next flavor + before borrowing in current flavor. The possible values are: + + - `Borrow` (default): allocate in current flavor if borrowing + is possible. + - `TryNextFlavor`: try next flavor even if the current + flavor has enough resources to borrow. + enum: + - Borrow + - TryNextFlavor + type: string + whenCanPreempt: + default: TryNextFlavor + description: |- + whenCanPreempt determines whether a workload should try the next flavor + before borrowing in current flavor. The possible values are: + + - `Preempt`: allocate in current flavor if it's possible to preempt some workloads. + - `TryNextFlavor` (default): try next flavor even if there are enough + candidates for preemption in the current flavor. + enum: + - Preempt + - TryNextFlavor + type: string + type: object + namespaceSelector: + description: |- + namespaceSelector defines which namespaces are allowed to submit workloads to + this clusterQueue. Beyond this basic support for policy, a policy agent like + Gatekeeper should be used to enforce more advanced policies. + Defaults to null which is a nothing selector (no namespaces eligible). 
+ If set to an empty selector `{}`, then all namespaces are eligible. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + preemption: + default: {} + description: |- + preemption describes policies to preempt Workloads from this ClusterQueue + or the ClusterQueue's cohort. + + Preemption can happen in two scenarios: + + - When a Workload fits within the nominal quota of the ClusterQueue, but + the quota is currently borrowed by other ClusterQueues in the cohort. + Preempting Workloads in other ClusterQueues allows this ClusterQueue to + reclaim its nominal quota. + - When a Workload doesn't fit within the nominal quota of the ClusterQueue + and there are admitted Workloads in the ClusterQueue with lower priority. + + The preemption algorithm tries to find a minimal set of Workloads to + preempt to accomomdate the pending Workload, preempting Workloads with + lower priority first. + properties: + borrowWithinCohort: + default: {} + description: |- + borrowWithinCohort provides configuration to allow preemption within + cohort while borrowing. + properties: + maxPriorityThreshold: + description: |- + maxPriorityThreshold allows to restrict the set of workloads which + might be preempted by a borrowing workload, to only workloads with + priority less than or equal to the specified threshold priority. + When the threshold is not specified, then any workload satisfying the + policy can be preempted by the borrowing workload. + format: int32 + type: integer + policy: + default: Never + description: |- + policy determines the policy for preemption to reclaim quota within cohort while borrowing. + Possible values are: + - `Never` (default): do not allow for preemption, in other + ClusterQueues within the cohort, for a borrowing workload. + - `LowerPriority`: allow preemption, in other ClusterQueues + within the cohort, for a borrowing workload, but only if + the preempted workloads are of lower priority. 
+ enum: + - Never + - LowerPriority + type: string + type: object + reclaimWithinCohort: + default: Never + description: |- + reclaimWithinCohort determines whether a pending Workload can preempt + Workloads from other ClusterQueues in the cohort that are using more than + their nominal quota. The possible values are: + + - `Never` (default): do not preempt Workloads in the cohort. + - `LowerPriority`: **Classic Preemption** if the pending Workload + fits within the nominal quota of its ClusterQueue, only preempt + Workloads in the cohort that have lower priority than the pending + Workload. **Fair Sharing** only preempt Workloads in the cohort that + have lower priority than the pending Workload and that satisfy the + fair sharing preemptionStategies. + - `Any`: **Classic Preemption** if the pending Workload fits within + the nominal quota of its ClusterQueue, preempt any Workload in the + cohort, irrespective of priority. **Fair Sharing** preempt Workloads + in the cohort that satisfy the fair sharing preemptionStrategies. + enum: + - Never + - LowerPriority + - Any + type: string + withinClusterQueue: + default: Never + description: |- + withinClusterQueue determines whether a pending Workload that doesn't fit + within the nominal quota for its ClusterQueue, can preempt active Workloads in + the ClusterQueue. The possible values are: + + - `Never` (default): do not preempt Workloads in the ClusterQueue. + - `LowerPriority`: only preempt Workloads in the ClusterQueue that have + lower priority than the pending Workload. + - `LowerOrNewerEqualPriority`: only preempt Workloads in the ClusterQueue that + either have a lower priority than the pending workload or equal priority + and are newer than the pending workload. + enum: + - Never + - LowerPriority + - LowerOrNewerEqualPriority + type: string + type: object + x-kubernetes-validations: + - message: reclaimWithinCohort=Never and borrowWithinCohort.Policy!=Never + rule: '!(self.reclaimWithinCohort == ''Never'' && has(self.borrowWithinCohort) + && self.borrowWithinCohort.policy != ''Never'')' + queueingStrategy: + default: BestEffortFIFO + description: |- + QueueingStrategy indicates the queueing strategy of the workloads + across the queues in this ClusterQueue. + Current Supported Strategies: + + - StrictFIFO: workloads are ordered strictly by creation time. + Older workloads that can't be admitted will block admitting newer + workloads even if they fit available quota. + - BestEffortFIFO: workloads are ordered by creation time, + however older workloads that can't be admitted will not block + admitting newer workloads that fit existing quota. + enum: + - StrictFIFO + - BestEffortFIFO + type: string + resourceGroups: + description: |- + resourceGroups describes groups of resources. + Each resource group defines the list of resources and a list of flavors + that provide quotas for these resources. + Each resource and each flavor can only form part of one resource group. + resourceGroups can be up to 16. + items: + properties: + coveredResources: + description: |- + coveredResources is the list of resources covered by the flavors in this + group. + Examples: cpu, memory, vendor.com/gpu. + The list cannot be empty and it can contain up to 16 resources. + items: + description: ResourceName is the name identifying various + resources in a ResourceList. + type: string + maxItems: 16 + minItems: 1 + type: array + flavors: + description: |- + flavors is the list of flavors that provide the resources of this group. 
+ Typically, different flavors represent different hardware models + (e.g., gpu models, cpu architectures) or pricing models (on-demand vs spot + cpus). + Each flavor MUST list all the resources listed for this group in the same + order as the .resources field. + The list cannot be empty and it can contain up to 16 flavors. + items: + properties: + name: + description: |- + name of this flavor. The name should match the .metadata.name of a + ResourceFlavor. If a matching ResourceFlavor does not exist, the + ClusterQueue will have an Active condition set to False. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + resources: + description: |- + resources is the list of quotas for this flavor per resource. + There could be up to 16 resources. + items: + properties: + borrowingLimit: + anyOf: + - type: integer + - type: string + description: |- + borrowingLimit is the maximum amount of quota for the [flavor, resource] + combination that this ClusterQueue is allowed to borrow from the unused + quota of other ClusterQueues in the same cohort. + In total, at a given time, Workloads in a ClusterQueue can consume a + quantity of quota equal to nominalQuota+borrowingLimit, assuming the other + ClusterQueues in the cohort have enough unused quota. + If null, it means that there is no borrowing limit. + If not null, it must be non-negative. + borrowingLimit must be null if spec.cohort is empty. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + lendingLimit: + anyOf: + - type: integer + - type: string + description: |- + lendingLimit is the maximum amount of unused quota for the [flavor, resource] + combination that this ClusterQueue can lend to other ClusterQueues in the same cohort. + In total, at a given time, ClusterQueue reserves for its exclusive use + a quantity of quota equals to nominalQuota - lendingLimit. + If null, it means that there is no lending limit, meaning that + all the nominalQuota can be borrowed by other clusterQueues in the cohort. + If not null, it must be non-negative. + lendingLimit must be null if spec.cohort is empty. + This field is in beta stage and is enabled by default. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + name: + description: name of this resource. + type: string + nominalQuota: + anyOf: + - type: integer + - type: string + description: |- + nominalQuota is the quantity of this resource that is available for + Workloads admitted by this ClusterQueue at a point in time. + The nominalQuota must be non-negative. + nominalQuota should represent the resources in the cluster available for + running jobs (after discounting resources consumed by system components + and pods not managed by kueue). In an autoscaled cluster, nominalQuota + should account for resources that can be provided by a component such as + Kubernetes cluster-autoscaler. + + If the ClusterQueue belongs to a cohort, the sum of the quotas for each + (flavor, resource) combination defines the maximum quantity that can be + allocated by a ClusterQueue in the cohort. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - name + - nominalQuota + type: object + maxItems: 16 + minItems: 1 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - name + - resources + type: object + maxItems: 16 + minItems: 1 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - coveredResources + - flavors + type: object + x-kubernetes-validations: + - message: flavors must have the same number of resources as the + coveredResources + rule: self.flavors.all(x, size(x.resources) == size(self.coveredResources)) + maxItems: 16 + type: array + x-kubernetes-list-type: atomic + stopPolicy: + default: None + description: |- + stopPolicy - if set to a value different from None, the ClusterQueue is considered Inactive, no new reservation being + made. + + Depending on its value, its associated workloads will: + + - None - Workloads are admitted + - HoldAndDrain - Admitted workloads are evicted and Reserving workloads will cancel the reservation. + - Hold - Admitted workloads will run to completion and Reserving workloads will cancel the reservation. + enum: + - None + - Hold + - HoldAndDrain + type: string + type: object + x-kubernetes-validations: + - message: borrowingLimit must be nil when cohort is empty + rule: '!has(self.cohort) && has(self.resourceGroups) ? self.resourceGroups.all(rg, + rg.flavors.all(f, f.resources.all(r, !has(r.borrowingLimit)))) : true' + status: + description: ClusterQueueStatus defines the observed state of ClusterQueue + properties: + admittedWorkloads: + description: |- + admittedWorkloads is the number of workloads currently admitted to this + clusterQueue and haven't finished yet. + format: int32 + type: integer + conditions: + description: |- + conditions hold the latest available observations of the ClusterQueue + current state. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + fairSharing: + description: FairSharing contains the information about the current + status of fair sharing. + properties: + weightedShare: + description: |- + WeightedShare represent the maximum of the ratios of usage above nominal + quota to the lendable resources in the cohort, among all the resources + provided by the ClusterQueue, and divided by the weight. + If zero, it means that the usage of the ClusterQueue is below the nominal quota. + If the ClusterQueue has a weight of zero, this will return 9223372036854775807, + the maximum possible share value. + format: int64 + type: integer + required: + - weightedShare + type: object + flavorsReservation: + description: |- + flavorsReservation are the reserved quotas, by flavor, currently in use by the + workloads assigned to this ClusterQueue. + items: + properties: + name: + description: name of the flavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + resources: + description: resources lists the quota usage for the resources + in this flavor. + items: + properties: + borrowed: + anyOf: + - type: integer + - type: string + description: |- + Borrowed is quantity of quota that is borrowed from the cohort. In other + words, it's the used quota that is over the nominalQuota. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + name: + description: name of the resource + type: string + total: + anyOf: + - type: integer + - type: string + description: |- + total is the total quantity of used quota, including the amount borrowed + from the cohort. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - name + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - name + - resources + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + flavorsUsage: + description: |- + flavorsUsage are the used quotas, by flavor, currently in use by the + workloads admitted in this ClusterQueue. + items: + properties: + name: + description: name of the flavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + resources: + description: resources lists the quota usage for the resources + in this flavor. + items: + properties: + borrowed: + anyOf: + - type: integer + - type: string + description: |- + Borrowed is quantity of quota that is borrowed from the cohort. In other + words, it's the used quota that is over the nominalQuota. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + name: + description: name of the resource + type: string + total: + anyOf: + - type: integer + - type: string + description: |- + total is the total quantity of used quota, including the amount borrowed + from the cohort. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - name + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - name + - resources + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + pendingWorkloads: + description: |- + pendingWorkloads is the number of workloads currently waiting to be + admitted to this clusterQueue. + format: int32 + type: integer + pendingWorkloadsStatus: + description: |- + PendingWorkloadsStatus contains the information exposed about the current + status of the pending workloads in the cluster queue. + Deprecated: This field will be removed on v1beta2, use VisibilityOnDemand + (https://kueue.sigs.k8s.io/docs/tasks/manage/monitor_pending_workloads/pending_workloads_on_demand/) + instead. + properties: + clusterQueuePendingWorkload: + description: Head contains the list of top pending workloads. + items: + description: |- + ClusterQueuePendingWorkload contains the information identifying a pending workload + in the cluster queue. + properties: + name: + description: Name indicates the name of the pending workload. + type: string + namespace: + description: Namespace indicates the name of the pending + workload. + type: string + required: + - name + - namespace + type: object + type: array + x-kubernetes-list-type: atomic + lastChangeTime: + description: LastChangeTime indicates the time of the last change + of the structure. + format: date-time + type: string + required: + - lastChangeTime + type: object + reservingWorkloads: + description: |- + reservingWorkloads is the number of workloads currently reserving quota in this + clusterQueue. + format: int32 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: cohorts.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: Cohort + listKind: CohortList + plural: cohorts + singular: cohort + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + Cohort is the Schema for the cohorts API. Using Hierarchical + Cohorts (any Cohort which has a parent) with Fair Sharing + results in undefined behavior in 0.9 + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. 
+ Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: CohortSpec defines the desired state of Cohort + properties: + parent: + description: |- + Parent references the name of the Cohort's parent, if + any. It satisfies one of three cases: + 1) Unset. This Cohort is the root of its Cohort tree. + 2) References a non-existent Cohort. We use default Cohort (no borrowing/lending limits). + 3) References an existent Cohort. + + If a cycle is created, we disable all members of the + Cohort, including ClusterQueues, until the cycle is + removed. We prevent further admission while the cycle + exists. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + resourceGroups: + description: |- + ResourceGroups describes groupings of Resources and + Flavors. Each ResourceGroup defines a list of Resources + and a list of Flavors which provide quotas for these + Resources. Each Resource and each Flavor may only form part + of one ResourceGroup. There may be up to 16 ResourceGroups + within a Cohort. + + BorrowingLimit limits how much members of this Cohort + subtree can borrow from the parent subtree. + + LendingLimit limits how much members of this Cohort subtree + can lend to the parent subtree. + + Borrowing and Lending limits must only be set when the + Cohort has a parent. Otherwise, the Cohort create/update + will be rejected by the webhook. + items: + properties: + coveredResources: + description: |- + coveredResources is the list of resources covered by the flavors in this + group. + Examples: cpu, memory, vendor.com/gpu. + The list cannot be empty and it can contain up to 16 resources. + items: + description: ResourceName is the name identifying various + resources in a ResourceList. + type: string + maxItems: 16 + minItems: 1 + type: array + flavors: + description: |- + flavors is the list of flavors that provide the resources of this group. + Typically, different flavors represent different hardware models + (e.g., gpu models, cpu architectures) or pricing models (on-demand vs spot + cpus). + Each flavor MUST list all the resources listed for this group in the same + order as the .resources field. + The list cannot be empty and it can contain up to 16 flavors. + items: + properties: + name: + description: |- + name of this flavor. The name should match the .metadata.name of a + ResourceFlavor. If a matching ResourceFlavor does not exist, the + ClusterQueue will have an Active condition set to False. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + resources: + description: |- + resources is the list of quotas for this flavor per resource. + There could be up to 16 resources. + items: + properties: + borrowingLimit: + anyOf: + - type: integer + - type: string + description: |- + borrowingLimit is the maximum amount of quota for the [flavor, resource] + combination that this ClusterQueue is allowed to borrow from the unused + quota of other ClusterQueues in the same cohort. + In total, at a given time, Workloads in a ClusterQueue can consume a + quantity of quota equal to nominalQuota+borrowingLimit, assuming the other + ClusterQueues in the cohort have enough unused quota. + If null, it means that there is no borrowing limit. 
+ If not null, it must be non-negative. + borrowingLimit must be null if spec.cohort is empty. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + lendingLimit: + anyOf: + - type: integer + - type: string + description: |- + lendingLimit is the maximum amount of unused quota for the [flavor, resource] + combination that this ClusterQueue can lend to other ClusterQueues in the same cohort. + In total, at a given time, ClusterQueue reserves for its exclusive use + a quantity of quota equals to nominalQuota - lendingLimit. + If null, it means that there is no lending limit, meaning that + all the nominalQuota can be borrowed by other clusterQueues in the cohort. + If not null, it must be non-negative. + lendingLimit must be null if spec.cohort is empty. + This field is in beta stage and is enabled by default. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + name: + description: name of this resource. + type: string + nominalQuota: + anyOf: + - type: integer + - type: string + description: |- + nominalQuota is the quantity of this resource that is available for + Workloads admitted by this ClusterQueue at a point in time. + The nominalQuota must be non-negative. + nominalQuota should represent the resources in the cluster available for + running jobs (after discounting resources consumed by system components + and pods not managed by kueue). In an autoscaled cluster, nominalQuota + should account for resources that can be provided by a component such as + Kubernetes cluster-autoscaler. + + If the ClusterQueue belongs to a cohort, the sum of the quotas for each + (flavor, resource) combination defines the maximum quantity that can be + allocated by a ClusterQueue in the cohort. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - name + - nominalQuota + type: object + maxItems: 16 + minItems: 1 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - name + - resources + type: object + maxItems: 16 + minItems: 1 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - coveredResources + - flavors + type: object + x-kubernetes-validations: + - message: flavors must have the same number of resources as the + coveredResources + rule: self.flavors.all(x, size(x.resources) == size(self.coveredResources)) + maxItems: 16 + type: array + x-kubernetes-list-type: atomic + type: object + type: object + served: true + storage: true +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: localqueues.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: LocalQueue + listKind: LocalQueueList + plural: localqueues + shortNames: + - queue + - queues + - lq + singular: localqueue + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Backing ClusterQueue + jsonPath: .spec.clusterQueue + name: ClusterQueue + type: string + - description: Number of pending workloads + jsonPath: .status.pendingWorkloads + name: Pending Workloads + type: integer + - description: Number of admitted workloads that haven't finished yet. + jsonPath: .status.admittedWorkloads + name: Admitted Workloads + type: integer + name: v1beta1 + schema: + openAPIV3Schema: + description: LocalQueue is the Schema for the localQueues API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: LocalQueueSpec defines the desired state of LocalQueue + properties: + clusterQueue: + description: clusterQueue is a reference to a clusterQueue that backs + this localQueue. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + x-kubernetes-validations: + - message: field is immutable + rule: self == oldSelf + stopPolicy: + default: None + description: |- + stopPolicy - if set to a value different from None, the LocalQueue is considered Inactive, + no new reservation being made. + + Depending on its value, its associated workloads will: + + - None - Workloads are admitted + - HoldAndDrain - Admitted workloads are evicted and Reserving workloads will cancel the reservation. 
+ - Hold - Admitted workloads will run to completion and Reserving workloads will cancel the reservation. + enum: + - None + - Hold + - HoldAndDrain + type: string + type: object + status: + description: LocalQueueStatus defines the observed state of LocalQueue + properties: + admittedWorkloads: + description: |- + admittedWorkloads is the number of workloads in this LocalQueue + admitted to a ClusterQueue and that haven't finished yet. + format: int32 + type: integer + conditions: + description: |- + Conditions hold the latest available observations of the LocalQueue + current state. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + flavorUsage: + description: |- + flavorsUsage are the used quotas, by flavor currently in use by the + workloads assigned to this LocalQueue. + items: + properties: + name: + description: name of the flavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + resources: + description: resources lists the quota usage for the resources + in this flavor. + items: + properties: + name: + description: name of the resource. + type: string + total: + anyOf: + - type: integer + - type: string + description: total is the total quantity of used quota. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - name + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - name + - resources + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + flavors: + description: flavors lists all currently available ResourceFlavors + in specified ClusterQueue. + items: + properties: + name: + description: name of the flavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + nodeLabels: + additionalProperties: + type: string + description: |- + nodeLabels are labels that associate the ResourceFlavor with Nodes that + have the same labels. + maxProperties: 8 + type: object + x-kubernetes-map-type: atomic + nodeTaints: + description: |- + nodeTaints are taints that the nodes associated with this ResourceFlavor + have. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to + a node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint + key. + type: string + required: + - effect + - key + type: object + maxItems: 8 + type: array + x-kubernetes-list-type: atomic + resources: + description: resources used in the flavor. + items: + description: ResourceName is the name identifying various + resources in a ResourceList. + type: string + maxItems: 16 + type: array + x-kubernetes-list-type: set + required: + - name + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + flavorsReservation: + description: |- + flavorsReservation are the reserved quotas, by flavor currently in use by the + workloads assigned to this LocalQueue. + items: + properties: + name: + description: name of the flavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + resources: + description: resources lists the quota usage for the resources + in this flavor. + items: + properties: + name: + description: name of the resource. + type: string + total: + anyOf: + - type: integer + - type: string + description: total is the total quantity of used quota. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - name + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - name + - resources + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + pendingWorkloads: + description: PendingWorkloads is the number of Workloads in the LocalQueue + not yet admitted to a ClusterQueue + format: int32 + type: integer + reservingWorkloads: + description: |- + reservingWorkloads is the number of workloads in this LocalQueue + reserving quota in a ClusterQueue and that haven't finished yet. + format: int32 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: multikueueclusters.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: MultiKueueCluster + listKind: MultiKueueClusterList + plural: multikueueclusters + singular: multikueuecluster + scope: Cluster + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: MultiKueueCluster is the Schema for the multikueue API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + kubeConfig: + description: Information how to connect to the cluster. + properties: + location: + description: |- + Location of the KubeConfig. + + If LocationType is Secret then Location is the name of the secret inside the namespace in + which the kueue controller manager is running. The config should be stored in the "kubeconfig" key. + type: string + locationType: + default: Secret + description: Type of the KubeConfig location. + enum: + - Secret + - Path + type: string + required: + - location + - locationType + type: object + required: + - kubeConfig + type: object + status: + properties: + conditions: + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. 
+ maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: multikueueconfigs.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: MultiKueueConfig + listKind: MultiKueueConfigList + plural: multikueueconfigs + singular: multikueueconfig + scope: Cluster + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: MultiKueueConfig is the Schema for the multikueue API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: MultiKueueConfigSpec defines the desired state of MultiKueueConfig + properties: + clusters: + description: List of MultiKueueClusters names where the workloads + from the ClusterQueue should be distributed. 
+ items: + type: string + maxItems: 10 + minItems: 1 + type: array + x-kubernetes-list-type: set + required: + - clusters + type: object + type: object + served: true + storage: true +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: provisioningrequestconfigs.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: ProvisioningRequestConfig + listKind: ProvisioningRequestConfigList + plural: provisioningrequestconfigs + singular: provisioningrequestconfig + scope: Cluster + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: ProvisioningRequestConfig is the Schema for the provisioningrequestconfig + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ProvisioningRequestConfigSpec defines the desired state of + ProvisioningRequestConfig + properties: + managedResources: + description: |- + managedResources contains the list of resources managed by the autoscaling. + + If empty, all resources are considered managed. + + If not empty, the ProvisioningRequest will contain only the podsets that are + requesting at least one of them. + + If none of the workloads podsets is requesting at least a managed resource, + the workload is considered ready. + items: + description: ResourceName is the name identifying various resources + in a ResourceList. + type: string + maxItems: 100 + type: array + x-kubernetes-list-type: set + parameters: + additionalProperties: + description: Parameter is limited to 255 characters. + maxLength: 255 + type: string + description: Parameters contains all other parameters classes may + require. + maxProperties: 100 + type: object + provisioningClassName: + description: |- + ProvisioningClassName describes the different modes of provisioning the resources. + Check autoscaling.x-k8s.io ProvisioningRequestSpec.ProvisioningClassName for details. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + retryStrategy: + default: + backoffBaseSeconds: 60 + backoffLimitCount: 3 + backoffMaxSeconds: 1800 + description: |- + retryStrategy defines strategy for retrying ProvisioningRequest. + If null, then the default configuration is applied with the following parameter values: + backoffLimitCount: 3 + backoffBaseSeconds: 60 - 1 min + backoffMaxSeconds: 1800 - 30 mins + + To switch off retry mechanism + set retryStrategy.backoffLimitCount to 0. + properties: + backoffBaseSeconds: + default: 60 + description: |- + BackoffBaseSeconds defines the base for the exponential backoff for + re-queuing an evicted workload. + + Defaults to 60. 
+ format: int32 + type: integer + backoffLimitCount: + default: 3 + description: |- + BackoffLimitCount defines the maximum number of re-queuing retries. + Once the number is reached, the workload is deactivated (`.spec.activate`=`false`). + + Every backoff duration is about "b*2^(n-1)+Rand" where: + - "b" represents the base set by "BackoffBaseSeconds" parameter, + - "n" represents the "workloadStatus.requeueState.count", + - "Rand" represents the random jitter. + During this time, the workload is taken as an inadmissible and + other workloads will have a chance to be admitted. + By default, the consecutive requeue delays are around: (60s, 120s, 240s, ...). + + Defaults to 3. + format: int32 + type: integer + backoffMaxSeconds: + default: 1800 + description: |- + BackoffMaxSeconds defines the maximum backoff time to re-queue an evicted workload. + + Defaults to 1800. + format: int32 + type: integer + type: object + required: + - provisioningClassName + type: object + type: object + served: true + storage: true +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: resourceflavors.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: ResourceFlavor + listKind: ResourceFlavorList + plural: resourceflavors + shortNames: + - flavor + - flavors + - rf + singular: resourceflavor + scope: Cluster + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: ResourceFlavor is the Schema for the resourceflavors API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ResourceFlavorSpec defines the desired state of the ResourceFlavor + properties: + nodeLabels: + additionalProperties: + type: string + description: |- + nodeLabels are labels that associate the ResourceFlavor with Nodes that + have the same labels. + When a Workload is admitted, its podsets can only get assigned + ResourceFlavors whose nodeLabels match the nodeSelector and nodeAffinity + fields. + Once a ResourceFlavor is assigned to a podSet, the ResourceFlavor's + nodeLabels should be injected into the pods of the Workload by the + controller that integrates with the Workload object. + + nodeLabels can be up to 8 elements. + maxProperties: 8 + type: object + x-kubernetes-map-type: atomic + nodeTaints: + description: |- + nodeTaints are taints that the nodes associated with this ResourceFlavor + have. + Workloads' podsets must have tolerations for these nodeTaints in order to + get assigned this ResourceFlavor during admission. + + An example of a nodeTaint is + cloud.provider.com/preemptible="true":NoSchedule + + nodeTaints can be up to 8 elements. 
+ items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + required: + - effect + - key + type: object + maxItems: 8 + type: array + x-kubernetes-list-type: atomic + x-kubernetes-validations: + - message: 'supported taint effect values: ''NoSchedule'', ''PreferNoSchedule'', + ''NoExecute''' + rule: self.all(x, x.effect in ['NoSchedule', 'PreferNoSchedule', + 'NoExecute']) + tolerations: + description: |- + tolerations are extra tolerations that will be added to the pods admitted in + the quota associated with this resource flavor. + + An example of a toleration is + cloud.provider.com/preemptible="true":NoSchedule + + tolerations can be up to 8 elements. + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + maxItems: 8 + type: array + x-kubernetes-list-type: atomic + x-kubernetes-validations: + - message: operator must be Exists when 'key' is empty, which means + 'match all values and all keys' + rule: 'self.all(x, !has(x.key) ? x.operator == ''Exists'' : true)' + - message: effect must be 'NoExecute' when 'tolerationSeconds' is + set + rule: 'self.all(x, has(x.tolerationSeconds) ? x.effect == ''NoExecute'' + : true)' + - message: 'supported toleration values: ''Equal''(default), ''Exists''' + rule: self.all(x, !has(x.operator) || x.operator in ['Equal', 'Exists']) + - message: a value must be empty when 'operator' is 'Exists' + rule: 'self.all(x, has(x.operator) && x.operator == ''Exists'' ? 
+ !has(x.value) : true)' + - message: 'supported taint effect values: ''NoSchedule'', ''PreferNoSchedule'', + ''NoExecute''' + rule: self.all(x, !has(x.effect) || x.effect in ['NoSchedule', 'PreferNoSchedule', + 'NoExecute']) + topologyName: + description: |- + topologyName indicates topology for the TAS ResourceFlavor. + When specified, it enables scraping of the topology information from the + nodes matching to the Resource Flavor node labels. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + type: object + x-kubernetes-validations: + - message: at least one nodeLabel is required when topology is set + rule: '!has(self.topologyName) || self.nodeLabels.size() >= 1' + - message: resourceFlavorSpec are immutable when topologyName is set + rule: '!has(oldSelf.topologyName) || self == oldSelf' + type: object + served: true + storage: true +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: topologies.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: Topology + listKind: TopologyList + plural: topologies + singular: topology + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: Topology is the Schema for the topology API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: TopologySpec defines the desired state of Topology + properties: + levels: + description: levels define the levels of topology. + items: + description: TopologyLevel defines the desired state of TopologyLevel + properties: + nodeLabel: + description: |- + nodeLabel indicates the name of the node label for a specific topology + level. 
+ + Examples: + - cloud.provider.com/topology-block + - cloud.provider.com/topology-rack + maxLength: 316 + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - nodeLabel + type: object + maxItems: 8 + minItems: 1 + type: array + x-kubernetes-list-type: atomic + x-kubernetes-validations: + - message: field is immutable + rule: self == oldSelf + - message: must be unique + rule: size(self.filter(i, size(self.filter(j, j == i)) > 1)) == + 0 + - message: the kubernetes.io/hostname label can only be used at the + lowest level of topology + rule: size(self.filter(i, i.nodeLabel == 'kubernetes.io/hostname')) + == 0 || self[size(self) - 1].nodeLabel == 'kubernetes.io/hostname' + required: + - levels + type: object + type: object + served: true + storage: true +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: workloadpriorityclasses.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: WorkloadPriorityClass + listKind: WorkloadPriorityClassList + plural: workloadpriorityclasses + singular: workloadpriorityclass + scope: Cluster + versions: + - additionalPrinterColumns: + - description: Value of workloadPriorityClass's Priority + jsonPath: .value + name: Value + type: integer + name: v1beta1 + schema: + openAPIV3Schema: + description: WorkloadPriorityClass is the Schema for the workloadPriorityClass + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + description: + description: |- + description is an arbitrary string that usually provides guidelines on + when this workloadPriorityClass should be used. + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + value: + description: |- + value represents the integer value of this workloadPriorityClass. This is the actual priority that workloads + receive when jobs have the name of this class in their workloadPriorityClass label. + Changing the value of workloadPriorityClass doesn't affect the priority of workloads that were already created. 
+ format: int32 + type: integer + required: + - value + type: object + served: true + storage: true + subresources: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) + controller-gen.kubebuilder.io/version: v0.16.5 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: workloads.kueue.x-k8s.io +spec: + group: kueue.x-k8s.io + names: + kind: Workload + listKind: WorkloadList + plural: workloads + shortNames: + - wl + singular: workload + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Name of the queue this workload was submitted to + jsonPath: .spec.queueName + name: Queue + type: string + - description: Name of the ClusterQueue where the workload is reserving quota + jsonPath: .status.admission.clusterQueue + name: Reserved in + type: string + - description: Admission status + jsonPath: .status.conditions[?(@.type=='Admitted')].status + name: Admitted + type: string + - description: Workload finished + jsonPath: .status.conditions[?(@.type=='Finished')].status + name: Finished + type: string + - description: Time this workload was created + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1beta1 + schema: + openAPIV3Schema: + description: Workload is the Schema for the workloads API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: WorkloadSpec defines the desired state of Workload + properties: + active: + default: true + description: |- + Active determines if a workload can be admitted into a queue. + Changing active from true to false will evict any running workloads. + Possible values are: + + - false: indicates that a workload should never be admitted and evicts running workloads + - true: indicates that a workload can be evaluated for admission into it's respective queue. + + Defaults to true + type: boolean + maximumExecutionTimeSeconds: + description: |- + maximumExecutionTimeSeconds if provided, determines the maximum time, in seconds, + the workload can be admitted before it's automatically deactivated. + + If unspecified, no execution time limit is enforced on the Workload. + format: int32 + minimum: 1 + type: integer + podSets: + description: |- + podSets is a list of sets of homogeneous pods, each described by a Pod spec + and a count. + There must be at least one element and at most 8. + podSets cannot be changed. + items: + properties: + count: + default: 1 + description: count is the number of pods for the spec. + format: int32 + minimum: 0 + type: integer + minCount: + description: |- + minCount is the minimum number of pods for the spec acceptable + if the workload supports partial admission. 
+ + If not provided, partial admission for the current PodSet is not + enabled. + + Only one podSet within the workload can use this. + + This is an alpha field and requires enabling PartialAdmission feature gate. + format: int32 + minimum: 1 + type: integer + name: + default: main + description: name is the PodSet name. + maxLength: 63 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + template: + description: |- + template is the Pod template. + + The only allowed fields in template.metadata are labels and annotations. + + If requests are omitted for a container or initContainer, + they default to the limits if they are explicitly specified for the + container or initContainer. + + During admission, the rules in nodeSelector and + nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution that match + the keys in the nodeLabels from the ResourceFlavors considered for this + Workload are used to filter the ResourceFlavors that can be assigned to + this podSet. + properties: + metadata: + description: |- + Standard object's metadata. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata + properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string + type: object + spec: + description: |- + Specification of the desired behavior of the pod. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status + properties: + activeDeadlineSeconds: + description: |- + Optional duration in seconds the pod may be active on the node relative to + StartTime before the system will actively try to mark it failed and kill associated containers. + Value must be a positive integer. + format: int64 + type: integer + affinity: + description: If specified, the pod's scheduling constraints + properties: + nodeAffinity: + description: Describes node affinity scheduling + rules for the pod. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: |- + The scheduler will prefer to schedule pods to nodes that satisfy + the affinity expressions specified by this field, but it may choose + a node that violates one or more of the expressions. The node that is + most preferred is the one with the greatest sum of weights, i.e. + for each node that meets all of the scheduling requirements (resource + request, requiredDuringScheduling affinity expressions, etc.), + compute a sum by iterating through the elements of this field and adding + "weight" to the sum if the node matches the corresponding matchExpressions; the + node(s) with the highest sum are the most preferred. + items: + description: |- + An empty preferred scheduling term matches all objects with implicit weight 0 + (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op). + properties: + preference: + description: A node selector term, associated + with the corresponding weight. + properties: + matchExpressions: + description: A list of node selector + requirements by node's labels. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that + the selector applies to. 
+ type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + description: A list of node selector + requirements by node's fields. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that + the selector applies to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + weight: + description: Weight associated with matching + the corresponding nodeSelectorTerm, + in the range 1-100. + format: int32 + type: integer + required: + - preference + - weight + type: object + type: array + x-kubernetes-list-type: atomic + requiredDuringSchedulingIgnoredDuringExecution: + description: |- + If the affinity requirements specified by this field are not met at + scheduling time, the pod will not be scheduled onto the node. + If the affinity requirements specified by this field cease to be met + at some point during pod execution (e.g. due to an update), the system + may or may not try to eventually evict the pod from its node. + properties: + nodeSelectorTerms: + description: Required. A list of node selector + terms. The terms are ORed. + items: + description: |- + A null or empty node selector term matches no objects. The requirements of + them are ANDed. + The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node selector + requirements by node's labels. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that + the selector applies to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. 
If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + description: A list of node selector + requirements by node's fields. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that + the selector applies to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-type: atomic + required: + - nodeSelectorTerms + type: object + x-kubernetes-map-type: atomic + type: object + podAffinity: + description: Describes pod affinity scheduling rules + (e.g. co-locate this pod in the same node, zone, + etc. as some other pod(s)). + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: |- + The scheduler will prefer to schedule pods to nodes that satisfy + the affinity expressions specified by this field, but it may choose + a node that violates one or more of the expressions. The node that is + most preferred is the one with the greatest sum of weights, i.e. + for each node that meets all of the scheduling requirements (resource + request, requiredDuringScheduling affinity expressions, etc.), + compute a sum by iterating through the elements of this field and adding + "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the + node(s) with the highest sum are the most preferred. + items: + description: The weights of all of the matched + WeightedPodAffinityTerm fields are added + per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Required. A pod affinity + term, associated with the corresponding + weight. + properties: + labelSelector: + description: |- + A label query over a set of resources, in this case pods. + If it's null, this PodAffinityTerm matches with no Pods. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. 
If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: |- + MatchLabelKeys is a set of pod label keys to select which pods will + be taken into consideration. The keys are used to lookup values from the + incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` + to select the group of existing pods which pods will be taken into consideration + for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming + pod labels will be ignored. The default value is empty. + The same key is forbidden to exist in both matchLabelKeys and labelSelector. + Also, matchLabelKeys cannot be set when labelSelector isn't set. + This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + mismatchLabelKeys: + description: |- + MismatchLabelKeys is a set of pod label keys to select which pods will + be taken into consideration. The keys are used to lookup values from the + incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` + to select the group of existing pods which pods will be taken into consideration + for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming + pod labels will be ignored. The default value is empty. + The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. + Also, mismatchLabelKeys cannot be set when labelSelector isn't set. + This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + namespaceSelector: + description: |- + A label query over the set of namespaces that the term applies to. + The term is applied to the union of the namespaces selected by this field + and the ones listed in the namespaces field. + null selector and null or empty namespaces list means "this pod's namespace". + An empty selector ({}) matches all namespaces. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. 
If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + namespaces: + description: |- + namespaces specifies a static list of namespace names that the term applies to. + The term is applied to the union of the namespaces listed in this field + and the ones selected by namespaceSelector. + null or empty namespaces list and null namespaceSelector means "this pod's namespace". + items: + type: string + type: array + x-kubernetes-list-type: atomic + topologyKey: + description: |- + This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where co-located is defined as running on a node + whose value of the label with key topologyKey matches that of any node on which any of the + selected pods is running. + Empty topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + weight: + description: |- + weight associated with matching the corresponding podAffinityTerm, + in the range 1-100. + format: int32 + type: integer + required: + - podAffinityTerm + - weight + type: object + type: array + x-kubernetes-list-type: atomic + requiredDuringSchedulingIgnoredDuringExecution: + description: |- + If the affinity requirements specified by this field are not met at + scheduling time, the pod will not be scheduled onto the node. + If the affinity requirements specified by this field cease to be met + at some point during pod execution (e.g. due to a pod label update), the + system may or may not try to eventually evict the pod from its node. + When there are multiple elements, the lists of nodes corresponding to each + podAffinityTerm are intersected, i.e. all terms must be satisfied. + items: + description: |- + Defines a set of pods (namely those matching the labelSelector + relative to the given namespace(s)) that this pod should be + co-located (affinity) or not co-located (anti-affinity) with, + where co-located is defined as running on a node whose value of + the label with key matches that of any node on which + a pod of the set of pods is running + properties: + labelSelector: + description: |- + A label query over a set of resources, in this case pods. + If it's null, this PodAffinityTerm matches with no Pods. + properties: + matchExpressions: + description: matchExpressions is a + list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. 
If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: |- + MatchLabelKeys is a set of pod label keys to select which pods will + be taken into consideration. The keys are used to lookup values from the + incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` + to select the group of existing pods which pods will be taken into consideration + for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming + pod labels will be ignored. The default value is empty. + The same key is forbidden to exist in both matchLabelKeys and labelSelector. + Also, matchLabelKeys cannot be set when labelSelector isn't set. + This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + mismatchLabelKeys: + description: |- + MismatchLabelKeys is a set of pod label keys to select which pods will + be taken into consideration. The keys are used to lookup values from the + incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` + to select the group of existing pods which pods will be taken into consideration + for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming + pod labels will be ignored. The default value is empty. + The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. + Also, mismatchLabelKeys cannot be set when labelSelector isn't set. + This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + namespaceSelector: + description: |- + A label query over the set of namespaces that the term applies to. + The term is applied to the union of the namespaces selected by this field + and the ones listed in the namespaces field. + null selector and null or empty namespaces list means "this pod's namespace". + An empty selector ({}) matches all namespaces. + properties: + matchExpressions: + description: matchExpressions is a + list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. 
If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + namespaces: + description: |- + namespaces specifies a static list of namespace names that the term applies to. + The term is applied to the union of the namespaces listed in this field + and the ones selected by namespaceSelector. + null or empty namespaces list and null namespaceSelector means "this pod's namespace". + items: + type: string + type: array + x-kubernetes-list-type: atomic + topologyKey: + description: |- + This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where co-located is defined as running on a node + whose value of the label with key topologyKey matches that of any node on which any of the + selected pods is running. + Empty topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + type: array + x-kubernetes-list-type: atomic + type: object + podAntiAffinity: + description: Describes pod anti-affinity scheduling + rules (e.g. avoid putting this pod in the same + node, zone, etc. as some other pod(s)). + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: |- + The scheduler will prefer to schedule pods to nodes that satisfy + the anti-affinity expressions specified by this field, but it may choose + a node that violates one or more of the expressions. The node that is + most preferred is the one with the greatest sum of weights, i.e. + for each node that meets all of the scheduling requirements (resource + request, requiredDuringScheduling anti-affinity expressions, etc.), + compute a sum by iterating through the elements of this field and adding + "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the + node(s) with the highest sum are the most preferred. + items: + description: The weights of all of the matched + WeightedPodAffinityTerm fields are added + per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Required. A pod affinity + term, associated with the corresponding + weight. + properties: + labelSelector: + description: |- + A label query over a set of resources, in this case pods. + If it's null, this PodAffinityTerm matches with no Pods. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. 
+ type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: |- + MatchLabelKeys is a set of pod label keys to select which pods will + be taken into consideration. The keys are used to lookup values from the + incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` + to select the group of existing pods which pods will be taken into consideration + for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming + pod labels will be ignored. The default value is empty. + The same key is forbidden to exist in both matchLabelKeys and labelSelector. + Also, matchLabelKeys cannot be set when labelSelector isn't set. + This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + mismatchLabelKeys: + description: |- + MismatchLabelKeys is a set of pod label keys to select which pods will + be taken into consideration. The keys are used to lookup values from the + incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` + to select the group of existing pods which pods will be taken into consideration + for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming + pod labels will be ignored. The default value is empty. + The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. + Also, mismatchLabelKeys cannot be set when labelSelector isn't set. + This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + namespaceSelector: + description: |- + A label query over the set of namespaces that the term applies to. + The term is applied to the union of the namespaces selected by this field + and the ones listed in the namespaces field. + null selector and null or empty namespaces list means "this pod's namespace". + An empty selector ({}) matches all namespaces. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. 
If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + namespaces: + description: |- + namespaces specifies a static list of namespace names that the term applies to. + The term is applied to the union of the namespaces listed in this field + and the ones selected by namespaceSelector. + null or empty namespaces list and null namespaceSelector means "this pod's namespace". + items: + type: string + type: array + x-kubernetes-list-type: atomic + topologyKey: + description: |- + This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where co-located is defined as running on a node + whose value of the label with key topologyKey matches that of any node on which any of the + selected pods is running. + Empty topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + weight: + description: |- + weight associated with matching the corresponding podAffinityTerm, + in the range 1-100. + format: int32 + type: integer + required: + - podAffinityTerm + - weight + type: object + type: array + x-kubernetes-list-type: atomic + requiredDuringSchedulingIgnoredDuringExecution: + description: |- + If the anti-affinity requirements specified by this field are not met at + scheduling time, the pod will not be scheduled onto the node. + If the anti-affinity requirements specified by this field cease to be met + at some point during pod execution (e.g. due to a pod label update), the + system may or may not try to eventually evict the pod from its node. + When there are multiple elements, the lists of nodes corresponding to each + podAffinityTerm are intersected, i.e. all terms must be satisfied. + items: + description: |- + Defines a set of pods (namely those matching the labelSelector + relative to the given namespace(s)) that this pod should be + co-located (affinity) or not co-located (anti-affinity) with, + where co-located is defined as running on a node whose value of + the label with key matches that of any node on which + a pod of the set of pods is running + properties: + labelSelector: + description: |- + A label query over a set of resources, in this case pods. + If it's null, this PodAffinityTerm matches with no Pods. + properties: + matchExpressions: + description: matchExpressions is a + list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. 
+ type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: |- + MatchLabelKeys is a set of pod label keys to select which pods will + be taken into consideration. The keys are used to lookup values from the + incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` + to select the group of existing pods which pods will be taken into consideration + for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming + pod labels will be ignored. The default value is empty. + The same key is forbidden to exist in both matchLabelKeys and labelSelector. + Also, matchLabelKeys cannot be set when labelSelector isn't set. + This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + mismatchLabelKeys: + description: |- + MismatchLabelKeys is a set of pod label keys to select which pods will + be taken into consideration. The keys are used to lookup values from the + incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` + to select the group of existing pods which pods will be taken into consideration + for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming + pod labels will be ignored. The default value is empty. + The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. + Also, mismatchLabelKeys cannot be set when labelSelector isn't set. + This is a beta field and requires enabling MatchLabelKeysInPodAffinity feature gate (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + namespaceSelector: + description: |- + A label query over the set of namespaces that the term applies to. + The term is applied to the union of the namespaces selected by this field + and the ones listed in the namespaces field. + null selector and null or empty namespaces list means "this pod's namespace". + An empty selector ({}) matches all namespaces. + properties: + matchExpressions: + description: matchExpressions is a + list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. 
If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + namespaces: + description: |- + namespaces specifies a static list of namespace names that the term applies to. + The term is applied to the union of the namespaces listed in this field + and the ones selected by namespaceSelector. + null or empty namespaces list and null namespaceSelector means "this pod's namespace". + items: + type: string + type: array + x-kubernetes-list-type: atomic + topologyKey: + description: |- + This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where co-located is defined as running on a node + whose value of the label with key topologyKey matches that of any node on which any of the + selected pods is running. + Empty topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + type: array + x-kubernetes-list-type: atomic + type: object + type: object + automountServiceAccountToken: + description: AutomountServiceAccountToken indicates + whether a service account token should be automatically + mounted. + type: boolean + containers: + description: |- + List of containers belonging to the pod. + Containers cannot currently be added or removed. + There must be at least one container in a Pod. + Cannot be updated. + items: + description: A single application container that you + want to run within a pod. + properties: + args: + description: |- + Arguments to the entrypoint. + The container image's CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. + More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell + items: + type: string + type: array + x-kubernetes-list-type: atomic + command: + description: |- + Entrypoint array. Not executed within a shell. + The container image's ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. 
+ More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell + items: + type: string + type: array + x-kubernetes-list-type: atomic + env: + description: |- + List of environment variables to set in the container. + Cannot be updated. + items: + description: EnvVar represents an environment + variable present in a Container. + properties: + name: + description: Name of the environment variable. + Must be a C_IDENTIFIER. + type: string + value: + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". + type: string + valueFrom: + description: Source for the environment + variable's value. Cannot be used if value + is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the + ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + fieldRef: + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. + properties: + apiVersion: + description: Version of the schema + the FieldPath is written in terms + of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to + select in the specified API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. + properties: + containerName: + description: 'Container name: required + for volumes, optional for env + vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output + format of the exposed resources, + defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource + to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + secretKeyRef: + description: Selects a key of a secret + in the pod's namespace + properties: + key: + description: The key of the secret + to select from. Must be a valid + secret key. + type: string + name: + default: "" + description: |- + Name of the referent. 
+ This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the + Secret or its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + envFrom: + description: |- + List of sources to populate environment variables in the container. + The keys defined within a source must be a C_IDENTIFIER. All invalid keys + will be reported as an event when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take precedence. + Values defined by an Env with a duplicate key will take precedence. + Cannot be updated. + items: + description: EnvFromSource represents the source + of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select from + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap + must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + prefix: + description: An optional identifier to prepend + to each key in the ConfigMap. Must be + a C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret + must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + type: object + type: array + x-kubernetes-list-type: atomic + image: + description: |- + Container image name. + More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config management to default or override + container images in workload controllers like Deployments and StatefulSets. + type: string + imagePullPolicy: + description: |- + Image pull policy. + One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/containers/images#updating-images + type: string + lifecycle: + description: |- + Actions that the management system should take in response to container lifecycle events. + Cannot be updated. + properties: + postStart: + description: |- + PostStart is called immediately after a container is created. If the handler fails, + the container is terminated and restarted according to its restart policy. + Other management of the container blocks until the hook completes. 
+ More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks + properties: + exec: + description: Exec specifies the action + to take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + sleep: + description: Sleep represents the duration + that the container should sleep before + being terminated. + properties: + seconds: + description: Seconds is the number + of seconds to sleep. + format: int64 + type: integer + required: + - seconds + type: object + tcpSocket: + description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the pod + IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: |- + PreStop is called immediately before a container is terminated due to an + API request or management event such as liveness/startup probe failure, + preemption, resource contention, etc. The handler is not called if the + container crashes or exits. The Pod's termination grace period countdown begins before the + PreStop hook is executed. Regardless of the outcome of the handler, the + container will eventually terminate within the Pod's termination grace + period (unless delayed by finalizers). Other management of the container blocks until the hook completes + or until the termination grace period is reached. 
+ More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks + properties: + exec: + description: Exec specifies the action + to take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + sleep: + description: Sleep represents the duration + that the container should sleep before + being terminated. + properties: + seconds: + description: Seconds is the number + of seconds to sleep. + format: int64 + type: integer + required: + - seconds + type: object + tcpSocket: + description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the pod + IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: |- + Periodic probe of container liveness. + Container will be restarted if the probe fails. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + properties: + exec: + description: Exec specifies the action to + take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. 
To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a + custom header to be used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action + involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. 
+ The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object + name: + description: |- + Name of the container specified as a DNS_LABEL. + Each container in a pod must have a unique name (DNS_LABEL). + Cannot be updated. + type: string + ports: + description: |- + List of ports to expose from the container. Not specifying a port here + DOES NOT prevent that port from being exposed. Any port which is + listening on the default "0.0.0.0" address inside a container will be + accessible from the network. + Modifying this array with strategic merge patch may corrupt the data. + For more information See https://github.com/kubernetes/kubernetes/issues/108255. + Cannot be updated. + items: + description: ContainerPort represents a network + port in a single container. + properties: + containerPort: + description: |- + Number of port to expose on the pod's IP address. + This must be a valid port number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external + port to. + type: string + hostPort: + description: |- + Number of port to expose on the host. + If specified, this must be a valid port number, 0 < x < 65536. + If HostNetwork is specified, this must match ContainerPort. + Most containers do not need this. + format: int32 + type: integer + name: + description: |- + If specified, this must be an IANA_SVC_NAME and unique within the pod. Each + named port in a pod must have a unique name. Name for the port that can be + referred to by services. + type: string + protocol: + default: TCP + description: |- + Protocol for port. Must be UDP, TCP, or SCTP. + Defaults to "TCP". + type: string + required: + - containerPort + type: object + type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map + readinessProbe: + description: |- + Periodic probe of container service readiness. + Container will be removed from service endpoints if the probe fails. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + properties: + exec: + description: Exec specifies the action to + take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a + custom header to be used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action + involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. 
+ Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object + resizePolicy: + description: Resources resize policy for the container. + items: + description: ContainerResizePolicy represents + resource resize policy for the container. + properties: + resourceName: + description: |- + Name of the resource to which this resource resize policy applies. + Supported values: cpu, memory. + type: string + restartPolicy: + description: |- + Restart policy to apply when specified resource is resized. + If not specified, it defaults to NotRequired. + type: string + required: + - resourceName + - restartPolicy + type: object + type: array + x-kubernetes-list-type: atomic + resources: + description: |- + Compute Resources required by this container. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one + entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + restartPolicy: + description: |- + RestartPolicy defines the restart behavior of individual containers in a pod. + This field may only be set for init containers, and the only allowed value is "Always". + For non-init containers or when this field is not specified, + the restart behavior is defined by the Pod's restart policy and the container type. + Setting the RestartPolicy as "Always" for the init container will have the following effect: + this init container will be continually restarted on + exit until all regular containers have terminated. Once all regular + containers have completed, all init containers with restartPolicy "Always" + will be shut down. This lifecycle differs from normal init containers and + is often referred to as a "sidecar" container. Although this init + container still starts in the init container sequence, it does not wait + for the container to complete before proceeding to the next init + container. Instead, the next init container starts immediately after this + init container is started, or after any startupProbe has successfully + completed. + type: string + securityContext: + description: |- + SecurityContext defines the security options the container should be run with. + If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. + More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ + properties: + allowPrivilegeEscalation: + description: |- + AllowPrivilegeEscalation controls whether a process can gain more + privileges than its parent process. This bool directly controls if + the no_new_privs flag will be set on the container process. + AllowPrivilegeEscalation is true always when the container is: + 1) run as Privileged + 2) has CAP_SYS_ADMIN + Note that this field cannot be set when spec.os.name is windows. + type: boolean + appArmorProfile: + description: |- + appArmorProfile is the AppArmor options to use by this container. If set, this profile + overrides the pod's appArmorProfile. + Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile loaded on the node that should be used. + The profile must be preconfigured on the node to work. + Must match the loaded name of the profile. + Must be set if and only if type is "Localhost". + type: string + type: + description: |- + type indicates which kind of AppArmor profile will be applied. + Valid options are: + Localhost - a profile pre-loaded on the node. + RuntimeDefault - the container runtime's default profile. + Unconfined - no AppArmor enforcement. + type: string + required: + - type + type: object + capabilities: + description: |- + The capabilities to add/drop when running containers. + Defaults to the default set of capabilities granted by the container runtime. + Note that this field cannot be set when spec.os.name is windows. + properties: + add: + description: Added capabilities + items: + description: Capability represent POSIX + capabilities type + type: string + type: array + x-kubernetes-list-type: atomic + drop: + description: Removed capabilities + items: + description: Capability represent POSIX + capabilities type + type: string + type: array + x-kubernetes-list-type: atomic + type: object + privileged: + description: |- + Run container in privileged mode. 
+ Processes in privileged containers are essentially equivalent to root on the host. + Defaults to false. + Note that this field cannot be set when spec.os.name is windows. + type: boolean + procMount: + description: |- + procMount denotes the type of proc mount to use for the containers. + The default value is Default which uses the container runtime defaults for + readonly paths and masked paths. + This requires the ProcMountType feature flag to be enabled. + Note that this field cannot be set when spec.os.name is windows. + type: string + readOnlyRootFilesystem: + description: |- + Whether this container has a read-only root filesystem. + Default is false. + Note that this field cannot be set when spec.os.name is windows. + type: boolean + runAsGroup: + description: |- + The GID to run the entrypoint of the container process. + Uses runtime default if unset. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. + format: int64 + type: integer + runAsNonRoot: + description: |- + Indicates that the container must run as a non-root user. + If true, the Kubelet will validate the image at runtime to ensure that it + does not run as UID 0 (root) and fail to start the container if it does. + If unset or false, no such validation will be performed. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: |- + The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. + format: int64 + type: integer + seLinuxOptions: + description: |- + The SELinux context to be applied to the container. + If unspecified, the container runtime will allocate a random SELinux context for each + container. May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. + properties: + level: + description: Level is SELinux level label + that applies to the container. + type: string + role: + description: Role is a SELinux role label + that applies to the container. + type: string + type: + description: Type is a SELinux type label + that applies to the container. + type: string + user: + description: User is a SELinux user label + that applies to the container. + type: string + type: object + seccompProfile: + description: |- + The seccomp options to use by this container. If seccomp options are + provided at both the pod & container level, the container options + override the pod options. + Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile defined in a file on the node should be used. + The profile must be preconfigured on the node to work. + Must be a descending path, relative to the kubelet's configured seccomp profile location. + Must be set if type is "Localhost". Must NOT be set for any other type. 
+ type: string + type: + description: |- + type indicates which kind of seccomp profile will be applied. + Valid options are: + + Localhost - a profile defined in a file on the node should be used. + RuntimeDefault - the container runtime default profile should be used. + Unconfined - no profile should be applied. + type: string + required: + - type + type: object + windowsOptions: + description: |- + The Windows specific settings applied to all containers. + If unspecified, the options from the PodSecurityContext will be used. + If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is linux. + properties: + gmsaCredentialSpec: + description: |- + GMSACredentialSpec is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the + GMSA credential spec named by the GMSACredentialSpecName field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is + the name of the GMSA credential spec + to use. + type: string + hostProcess: + description: |- + HostProcess determines if a container should be run as a 'Host Process' container. + All of a Pod's containers must have the same effective HostProcess value + (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). + In addition, if HostProcess is true then HostNetwork must also be set to true. + type: boolean + runAsUserName: + description: |- + The UserName in Windows to run the entrypoint of the container process. + Defaults to the user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + type: string + type: object + type: object + startupProbe: + description: |- + StartupProbe indicates that the Pod has successfully initialized. + If specified, no other probes are executed until this completes successfully. + If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. + This can be used to provide different probe parameters at the beginning of a Pod's lifecycle, + when it might take a long time to load data or warm a cache, than during steady-state operation. + This cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + properties: + exec: + description: Exec specifies the action to + take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. 
+ format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a + custom header to be used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action + involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. 
+ format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object + stdin: + description: |- + Whether this container should allocate a buffer for stdin in the container runtime. If this + is not set, reads from stdin in the container will always result in EOF. + Default is false. + type: boolean + stdinOnce: + description: |- + Whether the container runtime should close the stdin channel after it has been opened by + a single attach. When stdin is true the stdin stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the + first client attaches to stdin, and then remains open and accepts data until the client disconnects, + at which time stdin is closed and remains closed until the container is restarted. If this + flag is false, a container processes that reads from stdin will never receive an EOF. + Default is false + type: boolean + terminationMessagePath: + description: |- + Optional: Path at which the file to which the container's termination message + will be written is mounted into the container's filesystem. + Message written is intended to be brief final status, such as an assertion failure message. + Will be truncated by the node if greater than 4096 bytes. The total message length across + all containers will be limited to 12kb. + Defaults to /dev/termination-log. + Cannot be updated. + type: string + terminationMessagePolicy: + description: |- + Indicate how the termination message should be populated. File will use the contents of + terminationMessagePath to populate the container status message on both success and failure. + FallbackToLogsOnError will use the last chunk of container log output if the termination + message file is empty and the container exited with an error. + The log output is limited to 2048 bytes or 80 lines, whichever is smaller. + Defaults to File. + Cannot be updated. + type: string + tty: + description: |- + Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. + Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block + devices to be used by the container. + items: + description: volumeDevice describes a mapping + of a raw block device within a container. + properties: + devicePath: + description: devicePath is the path inside + of the container that the device will + be mapped to. + type: string + name: + description: name must match the name of + a persistentVolumeClaim in the pod + type: string + required: + - devicePath + - name + type: object + type: array + x-kubernetes-list-map-keys: + - devicePath + x-kubernetes-list-type: map + volumeMounts: + description: |- + Pod volumes to mount into the container's filesystem. + Cannot be updated. + items: + description: VolumeMount describes a mounting + of a Volume within a container. + properties: + mountPath: + description: |- + Path within the container at which the volume should be mounted. Must + not contain ':'. + type: string + mountPropagation: + description: |- + mountPropagation determines how mounts are propagated from the host + to container and the other way around. + When not set, MountPropagationNone is used. + This field is beta in 1.10. 
+ When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified + (which defaults to None). + type: string + name: + description: This must match the Name of + a Volume. + type: string + readOnly: + description: |- + Mounted read-only if true, read-write otherwise (false or unspecified). + Defaults to false. + type: boolean + recursiveReadOnly: + description: |- + RecursiveReadOnly specifies whether read-only mounts should be handled + recursively. + + If ReadOnly is false, this field has no meaning and must be unspecified. + + If ReadOnly is true, and this field is set to Disabled, the mount is not made + recursively read-only. If this field is set to IfPossible, the mount is made + recursively read-only, if it is supported by the container runtime. If this + field is set to Enabled, the mount is made recursively read-only if it is + supported by the container runtime, otherwise the pod will not be started and + an error will be generated to indicate the reason. + + If this field is set to IfPossible or Enabled, MountPropagation must be set to + None (or be unspecified, which defaults to None). + + If this field is not specified, it is treated as an equivalent of Disabled. + type: string + subPath: + description: |- + Path within the volume from which the container's volume should be mounted. + Defaults to "" (volume's root). + type: string + subPathExpr: + description: |- + Expanded path within the volume from which the container's volume should be mounted. + Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. + Defaults to "" (volume's root). + SubPathExpr and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + x-kubernetes-list-map-keys: + - mountPath + x-kubernetes-list-type: map + workingDir: + description: |- + Container's working directory. + If not specified, the container runtime's default will be used, which + might be configured in the container image. + Cannot be updated. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + dnsConfig: + description: |- + Specifies the DNS parameters of a pod. + Parameters specified here will be merged to the generated DNS + configuration based on DNSPolicy. + properties: + nameservers: + description: |- + A list of DNS name server IP addresses. + This will be appended to the base nameservers generated from DNSPolicy. + Duplicated nameservers will be removed. + items: + type: string + type: array + x-kubernetes-list-type: atomic + options: + description: |- + A list of DNS resolver options. + This will be merged with the base options generated from DNSPolicy. + Duplicated entries will be removed. Resolution options given in Options + will override those that appear in the base DNSPolicy. + items: + description: PodDNSConfigOption defines DNS resolver + options of a pod. + properties: + name: + description: Required. + type: string + value: + type: string + type: object + type: array + x-kubernetes-list-type: atomic + searches: + description: |- + A list of DNS search domains for host-name lookup. + This will be appended to the base search paths generated from DNSPolicy. + Duplicated search paths will be removed. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + dnsPolicy: + description: |- + Set DNS policy for the pod. + Defaults to "ClusterFirst". 
+ Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. + DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. + To have DNS options set along with hostNetwork, you have to specify DNS policy + explicitly to 'ClusterFirstWithHostNet'. + type: string + enableServiceLinks: + description: |- + EnableServiceLinks indicates whether information about services should be injected into pod's + environment variables, matching the syntax of Docker links. + Optional: Defaults to true. + type: boolean + ephemeralContainers: + description: |- + List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing + pod to perform user-initiated actions such as debugging. This list cannot be specified when + creating a pod, and it cannot be modified by updating the pod spec. In order to add an + ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. + items: + description: |- + An EphemeralContainer is a temporary container that you may add to an existing Pod for + user-initiated activities such as debugging. Ephemeral containers have no resource or + scheduling guarantees, and they will not be restarted when they exit or when a Pod is + removed or restarted. The kubelet may evict a Pod if an ephemeral container causes the + Pod to exceed its resource allocation. + + To add an ephemeral container, use the ephemeralcontainers subresource of an existing + Pod. Ephemeral containers may not be removed or restarted. + properties: + args: + description: |- + Arguments to the entrypoint. + The image's CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. + More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell + items: + type: string + type: array + x-kubernetes-list-type: atomic + command: + description: |- + Entrypoint array. Not executed within a shell. + The image's ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. + More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell + items: + type: string + type: array + x-kubernetes-list-type: atomic + env: + description: |- + List of environment variables to set in the container. + Cannot be updated. + items: + description: EnvVar represents an environment + variable present in a Container. + properties: + name: + description: Name of the environment variable. + Must be a C_IDENTIFIER. 
+ type: string + value: + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". + type: string + valueFrom: + description: Source for the environment + variable's value. Cannot be used if value + is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the + ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + fieldRef: + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. + properties: + apiVersion: + description: Version of the schema + the FieldPath is written in terms + of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to + select in the specified API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. + properties: + containerName: + description: 'Container name: required + for volumes, optional for env + vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output + format of the exposed resources, + defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource + to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + secretKeyRef: + description: Selects a key of a secret + in the pod's namespace + properties: + key: + description: The key of the secret + to select from. Must be a valid + secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the + Secret or its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + envFrom: + description: |- + List of sources to populate environment variables in the container. + The keys defined within a source must be a C_IDENTIFIER. All invalid keys + will be reported as an event when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take precedence. + Values defined by an Env with a duplicate key will take precedence. + Cannot be updated. + items: + description: EnvFromSource represents the source + of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select from + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap + must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + prefix: + description: An optional identifier to prepend + to each key in the ConfigMap. Must be + a C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret + must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + type: object + type: array + x-kubernetes-list-type: atomic + image: + description: |- + Container image name. + More info: https://kubernetes.io/docs/concepts/containers/images + type: string + imagePullPolicy: + description: |- + Image pull policy. + One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/containers/images#updating-images + type: string + lifecycle: + description: Lifecycle is not allowed for ephemeral + containers. + properties: + postStart: + description: |- + PostStart is called immediately after a container is created. If the handler fails, + the container is terminated and restarted according to its restart policy. + Other management of the container blocks until the hook completes. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks + properties: + exec: + description: Exec specifies the action + to take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. 
To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + sleep: + description: Sleep represents the duration + that the container should sleep before + being terminated. + properties: + seconds: + description: Seconds is the number + of seconds to sleep. + format: int64 + type: integer + required: + - seconds + type: object + tcpSocket: + description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the pod + IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: |- + PreStop is called immediately before a container is terminated due to an + API request or management event such as liveness/startup probe failure, + preemption, resource contention, etc. The handler is not called if the + container crashes or exits. The Pod's termination grace period countdown begins before the + PreStop hook is executed. Regardless of the outcome of the handler, the + container will eventually terminate within the Pod's termination grace + period (unless delayed by finalizers). Other management of the container blocks until the hook completes + or until the termination grace period is reached. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks + properties: + exec: + description: Exec specifies the action + to take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. 
To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + sleep: + description: Sleep represents the duration + that the container should sleep before + being terminated. + properties: + seconds: + description: Seconds is the number + of seconds to sleep. + format: int64 + type: integer + required: + - seconds + type: object + tcpSocket: + description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the pod + IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: Probes are not allowed for ephemeral + containers. + properties: + exec: + description: Exec specifies the action to + take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. 
+ format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a + custom header to be used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action + involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. 
+ format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object + name: + description: |- + Name of the ephemeral container specified as a DNS_LABEL. + This name must be unique among all containers, init containers and ephemeral containers. + type: string + ports: + description: Ports are not allowed for ephemeral + containers. + items: + description: ContainerPort represents a network + port in a single container. + properties: + containerPort: + description: |- + Number of port to expose on the pod's IP address. + This must be a valid port number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external + port to. + type: string + hostPort: + description: |- + Number of port to expose on the host. + If specified, this must be a valid port number, 0 < x < 65536. + If HostNetwork is specified, this must match ContainerPort. + Most containers do not need this. + format: int32 + type: integer + name: + description: |- + If specified, this must be an IANA_SVC_NAME and unique within the pod. Each + named port in a pod must have a unique name. Name for the port that can be + referred to by services. + type: string + protocol: + default: TCP + description: |- + Protocol for port. Must be UDP, TCP, or SCTP. + Defaults to "TCP". + type: string + required: + - containerPort + type: object + type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map + readinessProbe: + description: Probes are not allowed for ephemeral + containers. + properties: + exec: + description: Exec specifies the action to + take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated headers. 
+ items: + description: HTTPHeader describes a + custom header to be used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action + involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object + resizePolicy: + description: Resources resize policy for the container. + items: + description: ContainerResizePolicy represents + resource resize policy for the container. + properties: + resourceName: + description: |- + Name of the resource to which this resource resize policy applies. + Supported values: cpu, memory. 
+ type: string + restartPolicy: + description: |- + Restart policy to apply when specified resource is resized. + If not specified, it defaults to NotRequired. + type: string + required: + - resourceName + - restartPolicy + type: object + type: array + x-kubernetes-list-type: atomic + resources: + description: |- + Resources are not allowed for ephemeral containers. Ephemeral containers use spare resources + already allocated to the pod. + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one + entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + restartPolicy: + description: |- + Restart policy for the container to manage the restart behavior of each + container within a pod. + This may only be set for init containers. You cannot set this field on + ephemeral containers. + type: string + securityContext: + description: |- + Optional: SecurityContext defines the security options the ephemeral container should be run with. + If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. + properties: + allowPrivilegeEscalation: + description: |- + AllowPrivilegeEscalation controls whether a process can gain more + privileges than its parent process. This bool directly controls if + the no_new_privs flag will be set on the container process. + AllowPrivilegeEscalation is true always when the container is: + 1) run as Privileged + 2) has CAP_SYS_ADMIN + Note that this field cannot be set when spec.os.name is windows. + type: boolean + appArmorProfile: + description: |- + appArmorProfile is the AppArmor options to use by this container. If set, this profile + overrides the pod's appArmorProfile. 
+ Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile loaded on the node that should be used. + The profile must be preconfigured on the node to work. + Must match the loaded name of the profile. + Must be set if and only if type is "Localhost". + type: string + type: + description: |- + type indicates which kind of AppArmor profile will be applied. + Valid options are: + Localhost - a profile pre-loaded on the node. + RuntimeDefault - the container runtime's default profile. + Unconfined - no AppArmor enforcement. + type: string + required: + - type + type: object + capabilities: + description: |- + The capabilities to add/drop when running containers. + Defaults to the default set of capabilities granted by the container runtime. + Note that this field cannot be set when spec.os.name is windows. + properties: + add: + description: Added capabilities + items: + description: Capability represent POSIX + capabilities type + type: string + type: array + x-kubernetes-list-type: atomic + drop: + description: Removed capabilities + items: + description: Capability represent POSIX + capabilities type + type: string + type: array + x-kubernetes-list-type: atomic + type: object + privileged: + description: |- + Run container in privileged mode. + Processes in privileged containers are essentially equivalent to root on the host. + Defaults to false. + Note that this field cannot be set when spec.os.name is windows. + type: boolean + procMount: + description: |- + procMount denotes the type of proc mount to use for the containers. + The default value is Default which uses the container runtime defaults for + readonly paths and masked paths. + This requires the ProcMountType feature flag to be enabled. + Note that this field cannot be set when spec.os.name is windows. + type: string + readOnlyRootFilesystem: + description: |- + Whether this container has a read-only root filesystem. + Default is false. + Note that this field cannot be set when spec.os.name is windows. + type: boolean + runAsGroup: + description: |- + The GID to run the entrypoint of the container process. + Uses runtime default if unset. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. + format: int64 + type: integer + runAsNonRoot: + description: |- + Indicates that the container must run as a non-root user. + If true, the Kubelet will validate the image at runtime to ensure that it + does not run as UID 0 (root) and fail to start the container if it does. + If unset or false, no such validation will be performed. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: |- + The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. + format: int64 + type: integer + seLinuxOptions: + description: |- + The SELinux context to be applied to the container. 
+ If unspecified, the container runtime will allocate a random SELinux context for each + container. May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. + properties: + level: + description: Level is SELinux level label + that applies to the container. + type: string + role: + description: Role is a SELinux role label + that applies to the container. + type: string + type: + description: Type is a SELinux type label + that applies to the container. + type: string + user: + description: User is a SELinux user label + that applies to the container. + type: string + type: object + seccompProfile: + description: |- + The seccomp options to use by this container. If seccomp options are + provided at both the pod & container level, the container options + override the pod options. + Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile defined in a file on the node should be used. + The profile must be preconfigured on the node to work. + Must be a descending path, relative to the kubelet's configured seccomp profile location. + Must be set if type is "Localhost". Must NOT be set for any other type. + type: string + type: + description: |- + type indicates which kind of seccomp profile will be applied. + Valid options are: + + Localhost - a profile defined in a file on the node should be used. + RuntimeDefault - the container runtime default profile should be used. + Unconfined - no profile should be applied. + type: string + required: + - type + type: object + windowsOptions: + description: |- + The Windows specific settings applied to all containers. + If unspecified, the options from the PodSecurityContext will be used. + If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is linux. + properties: + gmsaCredentialSpec: + description: |- + GMSACredentialSpec is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the + GMSA credential spec named by the GMSACredentialSpecName field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is + the name of the GMSA credential spec + to use. + type: string + hostProcess: + description: |- + HostProcess determines if a container should be run as a 'Host Process' container. + All of a Pod's containers must have the same effective HostProcess value + (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). + In addition, if HostProcess is true then HostNetwork must also be set to true. + type: boolean + runAsUserName: + description: |- + The UserName in Windows to run the entrypoint of the container process. + Defaults to the user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + type: string + type: object + type: object + startupProbe: + description: Probes are not allowed for ephemeral + containers. + properties: + exec: + description: Exec specifies the action to + take. 
+ properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a + custom header to be used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action + involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. 
+ x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object + stdin: + description: |- + Whether this container should allocate a buffer for stdin in the container runtime. If this + is not set, reads from stdin in the container will always result in EOF. + Default is false. + type: boolean + stdinOnce: + description: |- + Whether the container runtime should close the stdin channel after it has been opened by + a single attach. When stdin is true the stdin stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the + first client attaches to stdin, and then remains open and accepts data until the client disconnects, + at which time stdin is closed and remains closed until the container is restarted. If this + flag is false, a container processes that reads from stdin will never receive an EOF. + Default is false + type: boolean + targetContainerName: + description: |- + If set, the name of the container from PodSpec that this ephemeral container targets. + The ephemeral container will be run in the namespaces (IPC, PID, etc) of this container. + If not set then the ephemeral container uses the namespaces configured in the Pod spec. + + The container runtime must implement support for this feature. If the runtime does not + support namespace targeting then the result of setting this field is undefined. + type: string + terminationMessagePath: + description: |- + Optional: Path at which the file to which the container's termination message + will be written is mounted into the container's filesystem. + Message written is intended to be brief final status, such as an assertion failure message. + Will be truncated by the node if greater than 4096 bytes. The total message length across + all containers will be limited to 12kb. + Defaults to /dev/termination-log. + Cannot be updated. + type: string + terminationMessagePolicy: + description: |- + Indicate how the termination message should be populated. File will use the contents of + terminationMessagePath to populate the container status message on both success and failure. + FallbackToLogsOnError will use the last chunk of container log output if the termination + message file is empty and the container exited with an error. + The log output is limited to 2048 bytes or 80 lines, whichever is smaller. + Defaults to File. 
+ Cannot be updated. + type: string + tty: + description: |- + Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. + Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block + devices to be used by the container. + items: + description: volumeDevice describes a mapping + of a raw block device within a container. + properties: + devicePath: + description: devicePath is the path inside + of the container that the device will + be mapped to. + type: string + name: + description: name must match the name of + a persistentVolumeClaim in the pod + type: string + required: + - devicePath + - name + type: object + type: array + x-kubernetes-list-map-keys: + - devicePath + x-kubernetes-list-type: map + volumeMounts: + description: |- + Pod volumes to mount into the container's filesystem. Subpath mounts are not allowed for ephemeral containers. + Cannot be updated. + items: + description: VolumeMount describes a mounting + of a Volume within a container. + properties: + mountPath: + description: |- + Path within the container at which the volume should be mounted. Must + not contain ':'. + type: string + mountPropagation: + description: |- + mountPropagation determines how mounts are propagated from the host + to container and the other way around. + When not set, MountPropagationNone is used. + This field is beta in 1.10. + When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified + (which defaults to None). + type: string + name: + description: This must match the Name of + a Volume. + type: string + readOnly: + description: |- + Mounted read-only if true, read-write otherwise (false or unspecified). + Defaults to false. + type: boolean + recursiveReadOnly: + description: |- + RecursiveReadOnly specifies whether read-only mounts should be handled + recursively. + + If ReadOnly is false, this field has no meaning and must be unspecified. + + If ReadOnly is true, and this field is set to Disabled, the mount is not made + recursively read-only. If this field is set to IfPossible, the mount is made + recursively read-only, if it is supported by the container runtime. If this + field is set to Enabled, the mount is made recursively read-only if it is + supported by the container runtime, otherwise the pod will not be started and + an error will be generated to indicate the reason. + + If this field is set to IfPossible or Enabled, MountPropagation must be set to + None (or be unspecified, which defaults to None). + + If this field is not specified, it is treated as an equivalent of Disabled. + type: string + subPath: + description: |- + Path within the volume from which the container's volume should be mounted. + Defaults to "" (volume's root). + type: string + subPathExpr: + description: |- + Expanded path within the volume from which the container's volume should be mounted. + Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. + Defaults to "" (volume's root). + SubPathExpr and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + x-kubernetes-list-map-keys: + - mountPath + x-kubernetes-list-type: map + workingDir: + description: |- + Container's working directory. + If not specified, the container runtime's default will be used, which + might be configured in the container image. + Cannot be updated. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + hostAliases: + description: |- + HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts + file if specified. + items: + description: |- + HostAlias holds the mapping between IP and hostnames that will be injected as an entry in the + pod's hosts file. + properties: + hostnames: + description: Hostnames for the above IP address. + items: + type: string + type: array + x-kubernetes-list-type: atomic + ip: + description: IP address of the host file entry. + type: string + required: + - ip + type: object + type: array + x-kubernetes-list-map-keys: + - ip + x-kubernetes-list-type: map + hostIPC: + description: |- + Use the host's ipc namespace. + Optional: Default to false. + type: boolean + hostNetwork: + description: |- + Host networking requested for this pod. Use the host's network namespace. + If this option is set, the ports that will be used must be specified. + Default to false. + type: boolean + hostPID: + description: |- + Use the host's pid namespace. + Optional: Default to false. + type: boolean + hostUsers: + description: |- + Use the host's user namespace. + Optional: Default to true. + If set to true or not present, the pod will be run in the host user namespace, useful + for when the pod needs a feature only available to the host user namespace, such as + loading a kernel module with CAP_SYS_MODULE. + When set to false, a new userns is created for the pod. Setting false is useful for + mitigating container breakout vulnerabilities even allowing users to run their + containers as root without actually having root privileges on the host. + This field is alpha-level and is only honored by servers that enable the UserNamespacesSupport feature. + type: boolean + hostname: + description: |- + Specifies the hostname of the Pod + If not specified, the pod's hostname will be set to a system-defined value. + type: string + imagePullSecrets: + description: |- + ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. + If specified, these secrets will be passed to individual puller implementations for them to use. + More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod + items: + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + initContainers: + description: |- + List of initialization containers belonging to the pod. + Init containers are executed in order prior to containers being started. If any + init container fails, the pod is considered to have failed and is handled according + to its restartPolicy. The name for an init container or normal container must be + unique among all containers. + Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. 
+ The resourceRequirements of an init container are taken into account during scheduling + by finding the highest request/limit for each resource type, and then using the max of + of that value or the sum of the normal containers. Limits are applied to init containers + in a similar fashion. + Init containers cannot currently be added or removed. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ + items: + description: A single application container that you + want to run within a pod. + properties: + args: + description: |- + Arguments to the entrypoint. + The container image's CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. + More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell + items: + type: string + type: array + x-kubernetes-list-type: atomic + command: + description: |- + Entrypoint array. Not executed within a shell. + The container image's ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. + More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell + items: + type: string + type: array + x-kubernetes-list-type: atomic + env: + description: |- + List of environment variables to set in the container. + Cannot be updated. + items: + description: EnvVar represents an environment + variable present in a Container. + properties: + name: + description: Name of the environment variable. + Must be a C_IDENTIFIER. + type: string + value: + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". + type: string + valueFrom: + description: Source for the environment + variable's value. Cannot be used if value + is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the + ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + fieldRef: + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. + properties: + apiVersion: + description: Version of the schema + the FieldPath is written in terms + of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to + select in the specified API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. + properties: + containerName: + description: 'Container name: required + for volumes, optional for env + vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output + format of the exposed resources, + defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource + to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + secretKeyRef: + description: Selects a key of a secret + in the pod's namespace + properties: + key: + description: The key of the secret + to select from. Must be a valid + secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the + Secret or its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + envFrom: + description: |- + List of sources to populate environment variables in the container. + The keys defined within a source must be a C_IDENTIFIER. All invalid keys + will be reported as an event when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take precedence. + Values defined by an Env with a duplicate key will take precedence. + Cannot be updated. + items: + description: EnvFromSource represents the source + of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select from + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap + must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + prefix: + description: An optional identifier to prepend + to each key in the ConfigMap. Must be + a C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret + must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + type: object + type: array + x-kubernetes-list-type: atomic + image: + description: |- + Container image name. + More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config management to default or override + container images in workload controllers like Deployments and StatefulSets. + type: string + imagePullPolicy: + description: |- + Image pull policy. + One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/containers/images#updating-images + type: string + lifecycle: + description: |- + Actions that the management system should take in response to container lifecycle events. + Cannot be updated. + properties: + postStart: + description: |- + PostStart is called immediately after a container is created. If the handler fails, + the container is terminated and restarted according to its restart policy. + Other management of the container blocks until the hook completes. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks + properties: + exec: + description: Exec specifies the action + to take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the + HTTP server. 
+ type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + sleep: + description: Sleep represents the duration + that the container should sleep before + being terminated. + properties: + seconds: + description: Seconds is the number + of seconds to sleep. + format: int64 + type: integer + required: + - seconds + type: object + tcpSocket: + description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the pod + IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: |- + PreStop is called immediately before a container is terminated due to an + API request or management event such as liveness/startup probe failure, + preemption, resource contention, etc. The handler is not called if the + container crashes or exits. The Pod's termination grace period countdown begins before the + PreStop hook is executed. Regardless of the outcome of the handler, the + container will eventually terminate within the Pod's termination grace + period (unless delayed by finalizers). Other management of the container blocks until the hook completes + or until the termination grace period is reached. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks + properties: + exec: + description: Exec specifies the action + to take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the + HTTP server. 
+ type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + sleep: + description: Sleep represents the duration + that the container should sleep before + being terminated. + properties: + seconds: + description: Seconds is the number + of seconds to sleep. + format: int64 + type: integer + required: + - seconds + type: object + tcpSocket: + description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the pod + IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: |- + Periodic probe of container liveness. + Container will be restarted if the probe fails. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + properties: + exec: + description: Exec specifies the action to + take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a + custom header to be used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. 
+ type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action + involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object + name: + description: |- + Name of the container specified as a DNS_LABEL. + Each container in a pod must have a unique name (DNS_LABEL). + Cannot be updated. + type: string + ports: + description: |- + List of ports to expose from the container. Not specifying a port here + DOES NOT prevent that port from being exposed. Any port which is + listening on the default "0.0.0.0" address inside a container will be + accessible from the network. + Modifying this array with strategic merge patch may corrupt the data. + For more information See https://github.com/kubernetes/kubernetes/issues/108255. + Cannot be updated. 
+ items: + description: ContainerPort represents a network + port in a single container. + properties: + containerPort: + description: |- + Number of port to expose on the pod's IP address. + This must be a valid port number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external + port to. + type: string + hostPort: + description: |- + Number of port to expose on the host. + If specified, this must be a valid port number, 0 < x < 65536. + If HostNetwork is specified, this must match ContainerPort. + Most containers do not need this. + format: int32 + type: integer + name: + description: |- + If specified, this must be an IANA_SVC_NAME and unique within the pod. Each + named port in a pod must have a unique name. Name for the port that can be + referred to by services. + type: string + protocol: + default: TCP + description: |- + Protocol for port. Must be UDP, TCP, or SCTP. + Defaults to "TCP". + type: string + required: + - containerPort + type: object + type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map + readinessProbe: + description: |- + Periodic probe of container service readiness. + Container will be removed from service endpoints if the probe fails. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + properties: + exec: + description: Exec specifies the action to + take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a + custom header to be used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. 
+ type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action + involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object + resizePolicy: + description: Resources resize policy for the container. + items: + description: ContainerResizePolicy represents + resource resize policy for the container. + properties: + resourceName: + description: |- + Name of the resource to which this resource resize policy applies. + Supported values: cpu, memory. + type: string + restartPolicy: + description: |- + Restart policy to apply when specified resource is resized. + If not specified, it defaults to NotRequired. 
+ type: string + required: + - resourceName + - restartPolicy + type: object + type: array + x-kubernetes-list-type: atomic + resources: + description: |- + Compute Resources required by this container. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one + entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + restartPolicy: + description: |- + RestartPolicy defines the restart behavior of individual containers in a pod. + This field may only be set for init containers, and the only allowed value is "Always". + For non-init containers or when this field is not specified, + the restart behavior is defined by the Pod's restart policy and the container type. + Setting the RestartPolicy as "Always" for the init container will have the following effect: + this init container will be continually restarted on + exit until all regular containers have terminated. Once all regular + containers have completed, all init containers with restartPolicy "Always" + will be shut down. This lifecycle differs from normal init containers and + is often referred to as a "sidecar" container. Although this init + container still starts in the init container sequence, it does not wait + for the container to complete before proceeding to the next init + container. Instead, the next init container starts immediately after this + init container is started, or after any startupProbe has successfully + completed. + type: string + securityContext: + description: |- + SecurityContext defines the security options the container should be run with. 
+ If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. + More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ + properties: + allowPrivilegeEscalation: + description: |- + AllowPrivilegeEscalation controls whether a process can gain more + privileges than its parent process. This bool directly controls if + the no_new_privs flag will be set on the container process. + AllowPrivilegeEscalation is true always when the container is: + 1) run as Privileged + 2) has CAP_SYS_ADMIN + Note that this field cannot be set when spec.os.name is windows. + type: boolean + appArmorProfile: + description: |- + appArmorProfile is the AppArmor options to use by this container. If set, this profile + overrides the pod's appArmorProfile. + Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile loaded on the node that should be used. + The profile must be preconfigured on the node to work. + Must match the loaded name of the profile. + Must be set if and only if type is "Localhost". + type: string + type: + description: |- + type indicates which kind of AppArmor profile will be applied. + Valid options are: + Localhost - a profile pre-loaded on the node. + RuntimeDefault - the container runtime's default profile. + Unconfined - no AppArmor enforcement. + type: string + required: + - type + type: object + capabilities: + description: |- + The capabilities to add/drop when running containers. + Defaults to the default set of capabilities granted by the container runtime. + Note that this field cannot be set when spec.os.name is windows. + properties: + add: + description: Added capabilities + items: + description: Capability represent POSIX + capabilities type + type: string + type: array + x-kubernetes-list-type: atomic + drop: + description: Removed capabilities + items: + description: Capability represent POSIX + capabilities type + type: string + type: array + x-kubernetes-list-type: atomic + type: object + privileged: + description: |- + Run container in privileged mode. + Processes in privileged containers are essentially equivalent to root on the host. + Defaults to false. + Note that this field cannot be set when spec.os.name is windows. + type: boolean + procMount: + description: |- + procMount denotes the type of proc mount to use for the containers. + The default value is Default which uses the container runtime defaults for + readonly paths and masked paths. + This requires the ProcMountType feature flag to be enabled. + Note that this field cannot be set when spec.os.name is windows. + type: string + readOnlyRootFilesystem: + description: |- + Whether this container has a read-only root filesystem. + Default is false. + Note that this field cannot be set when spec.os.name is windows. + type: boolean + runAsGroup: + description: |- + The GID to run the entrypoint of the container process. + Uses runtime default if unset. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. + format: int64 + type: integer + runAsNonRoot: + description: |- + Indicates that the container must run as a non-root user. + If true, the Kubelet will validate the image at runtime to ensure that it + does not run as UID 0 (root) and fail to start the container if it does. 
+ If unset or false, no such validation will be performed. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: |- + The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. + format: int64 + type: integer + seLinuxOptions: + description: |- + The SELinux context to be applied to the container. + If unspecified, the container runtime will allocate a random SELinux context for each + container. May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. + properties: + level: + description: Level is SELinux level label + that applies to the container. + type: string + role: + description: Role is a SELinux role label + that applies to the container. + type: string + type: + description: Type is a SELinux type label + that applies to the container. + type: string + user: + description: User is a SELinux user label + that applies to the container. + type: string + type: object + seccompProfile: + description: |- + The seccomp options to use by this container. If seccomp options are + provided at both the pod & container level, the container options + override the pod options. + Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile defined in a file on the node should be used. + The profile must be preconfigured on the node to work. + Must be a descending path, relative to the kubelet's configured seccomp profile location. + Must be set if type is "Localhost". Must NOT be set for any other type. + type: string + type: + description: |- + type indicates which kind of seccomp profile will be applied. + Valid options are: + + Localhost - a profile defined in a file on the node should be used. + RuntimeDefault - the container runtime default profile should be used. + Unconfined - no profile should be applied. + type: string + required: + - type + type: object + windowsOptions: + description: |- + The Windows specific settings applied to all containers. + If unspecified, the options from the PodSecurityContext will be used. + If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is linux. + properties: + gmsaCredentialSpec: + description: |- + GMSACredentialSpec is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the + GMSA credential spec named by the GMSACredentialSpecName field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is + the name of the GMSA credential spec + to use. + type: string + hostProcess: + description: |- + HostProcess determines if a container should be run as a 'Host Process' container. + All of a Pod's containers must have the same effective HostProcess value + (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). 
+ In addition, if HostProcess is true then HostNetwork must also be set to true. + type: boolean + runAsUserName: + description: |- + The UserName in Windows to run the entrypoint of the container process. + Defaults to the user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + type: string + type: object + type: object + startupProbe: + description: |- + StartupProbe indicates that the Pod has successfully initialized. + If specified, no other probes are executed until this completes successfully. + If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. + This can be used to provide different probe parameters at the beginning of a Pod's lifecycle, + when it might take a long time to load data or warm a cache, than during steady-state operation. + This cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + properties: + exec: + description: Exec specifies the action to + take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a + custom header to be used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. 
+ Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action + involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object + stdin: + description: |- + Whether this container should allocate a buffer for stdin in the container runtime. If this + is not set, reads from stdin in the container will always result in EOF. + Default is false. + type: boolean + stdinOnce: + description: |- + Whether the container runtime should close the stdin channel after it has been opened by + a single attach. When stdin is true the stdin stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the + first client attaches to stdin, and then remains open and accepts data until the client disconnects, + at which time stdin is closed and remains closed until the container is restarted. If this + flag is false, a container processes that reads from stdin will never receive an EOF. + Default is false + type: boolean + terminationMessagePath: + description: |- + Optional: Path at which the file to which the container's termination message + will be written is mounted into the container's filesystem. + Message written is intended to be brief final status, such as an assertion failure message. + Will be truncated by the node if greater than 4096 bytes. 
The total message length across + all containers will be limited to 12kb. + Defaults to /dev/termination-log. + Cannot be updated. + type: string + terminationMessagePolicy: + description: |- + Indicate how the termination message should be populated. File will use the contents of + terminationMessagePath to populate the container status message on both success and failure. + FallbackToLogsOnError will use the last chunk of container log output if the termination + message file is empty and the container exited with an error. + The log output is limited to 2048 bytes or 80 lines, whichever is smaller. + Defaults to File. + Cannot be updated. + type: string + tty: + description: |- + Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. + Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block + devices to be used by the container. + items: + description: volumeDevice describes a mapping + of a raw block device within a container. + properties: + devicePath: + description: devicePath is the path inside + of the container that the device will + be mapped to. + type: string + name: + description: name must match the name of + a persistentVolumeClaim in the pod + type: string + required: + - devicePath + - name + type: object + type: array + x-kubernetes-list-map-keys: + - devicePath + x-kubernetes-list-type: map + volumeMounts: + description: |- + Pod volumes to mount into the container's filesystem. + Cannot be updated. + items: + description: VolumeMount describes a mounting + of a Volume within a container. + properties: + mountPath: + description: |- + Path within the container at which the volume should be mounted. Must + not contain ':'. + type: string + mountPropagation: + description: |- + mountPropagation determines how mounts are propagated from the host + to container and the other way around. + When not set, MountPropagationNone is used. + This field is beta in 1.10. + When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified + (which defaults to None). + type: string + name: + description: This must match the Name of + a Volume. + type: string + readOnly: + description: |- + Mounted read-only if true, read-write otherwise (false or unspecified). + Defaults to false. + type: boolean + recursiveReadOnly: + description: |- + RecursiveReadOnly specifies whether read-only mounts should be handled + recursively. + + If ReadOnly is false, this field has no meaning and must be unspecified. + + If ReadOnly is true, and this field is set to Disabled, the mount is not made + recursively read-only. If this field is set to IfPossible, the mount is made + recursively read-only, if it is supported by the container runtime. If this + field is set to Enabled, the mount is made recursively read-only if it is + supported by the container runtime, otherwise the pod will not be started and + an error will be generated to indicate the reason. + + If this field is set to IfPossible or Enabled, MountPropagation must be set to + None (or be unspecified, which defaults to None). + + If this field is not specified, it is treated as an equivalent of Disabled. + type: string + subPath: + description: |- + Path within the volume from which the container's volume should be mounted. + Defaults to "" (volume's root). + type: string + subPathExpr: + description: |- + Expanded path within the volume from which the container's volume should be mounted. 
+ Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. + Defaults to "" (volume's root). + SubPathExpr and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + x-kubernetes-list-map-keys: + - mountPath + x-kubernetes-list-type: map + workingDir: + description: |- + Container's working directory. + If not specified, the container runtime's default will be used, which + might be configured in the container image. + Cannot be updated. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + nodeName: + description: |- + NodeName indicates in which node this pod is scheduled. + If empty, this pod is a candidate for scheduling by the scheduler defined in schedulerName. + Once this field is set, the kubelet for this node becomes responsible for the lifecycle of this pod. + This field should not be used to express a desire for the pod to be scheduled on a specific node. + https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodename + type: string + nodeSelector: + additionalProperties: + type: string + description: |- + NodeSelector is a selector which must be true for the pod to fit on a node. + Selector which must match a node's labels for the pod to be scheduled on that node. + More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + type: object + x-kubernetes-map-type: atomic + os: + description: |- + Specifies the OS of the containers in the pod. + Some pod and container fields are restricted if this is set. + + If the OS field is set to linux, the following fields must be unset: + -securityContext.windowsOptions + + If the OS field is set to windows, following fields must be unset: + - spec.hostPID + - spec.hostIPC + - spec.hostUsers + - spec.securityContext.appArmorProfile + - spec.securityContext.seLinuxOptions + - spec.securityContext.seccompProfile + - spec.securityContext.fsGroup + - spec.securityContext.fsGroupChangePolicy + - spec.securityContext.sysctls + - spec.shareProcessNamespace + - spec.securityContext.runAsUser + - spec.securityContext.runAsGroup + - spec.securityContext.supplementalGroups + - spec.securityContext.supplementalGroupsPolicy + - spec.containers[*].securityContext.appArmorProfile + - spec.containers[*].securityContext.seLinuxOptions + - spec.containers[*].securityContext.seccompProfile + - spec.containers[*].securityContext.capabilities + - spec.containers[*].securityContext.readOnlyRootFilesystem + - spec.containers[*].securityContext.privileged + - spec.containers[*].securityContext.allowPrivilegeEscalation + - spec.containers[*].securityContext.procMount + - spec.containers[*].securityContext.runAsUser + - spec.containers[*].securityContext.runAsGroup + properties: + name: + description: |- + Name is the name of the operating system. The currently supported values are linux and windows. 
+ Additional value may be defined in future and can be one of: + https://github.com/opencontainers/runtime-spec/blob/master/config.md#platform-specific-configuration + Clients should expect to handle additional values and treat unrecognized values in this field as os: null + type: string + required: + - name + type: object + overhead: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. + This field will be autopopulated at admission time by the RuntimeClass admission controller. If + the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. + The RuntimeClass admission controller will reject Pod create requests which have the overhead already + set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value + defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. + More info: https://git.k8s.io/enhancements/keps/sig-node/688-pod-overhead/README.md + type: object + preemptionPolicy: + description: |- + PreemptionPolicy is the Policy for preempting pods with lower priority. + One of Never, PreemptLowerPriority. + Defaults to PreemptLowerPriority if unset. + type: string + priority: + description: |- + The priority value. Various system components use this field to find the + priority of the pod. When Priority Admission Controller is enabled, it + prevents users from setting this field. The admission controller populates + this field from PriorityClassName. + The higher the value, the higher the priority. + format: int32 + type: integer + priorityClassName: + description: |- + If specified, indicates the pod's priority. "system-node-critical" and + "system-cluster-critical" are two special keywords which indicate the + highest priorities with the former being the highest priority. Any other + name must be defined by creating a PriorityClass object with that name. + If not specified, the pod priority will be default or zero if there is no + default. + type: string + readinessGates: + description: |- + If specified, all readiness gates will be evaluated for pod readiness. + A pod is ready when all its containers are ready AND + all conditions specified in the readiness gates have status equal to "True" + More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates + items: + description: PodReadinessGate contains the reference + to a pod condition + properties: + conditionType: + description: ConditionType refers to a condition + in the pod's condition list with matching type. + type: string + required: + - conditionType + type: object + type: array + x-kubernetes-list-type: atomic + resourceClaims: + description: |- + ResourceClaims defines which ResourceClaims must be allocated + and reserved before the Pod is allowed to start. The resources + will be made available to those containers which consume them + by name. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. + items: + description: |- + PodResourceClaim references exactly one ResourceClaim, either directly + or by naming a ResourceClaimTemplate which is then turned into a ResourceClaim + for the pod. 
+ + It adds a name to it that uniquely identifies the ResourceClaim inside the Pod. + Containers that need access to the ResourceClaim reference it with this name. + properties: + name: + description: |- + Name uniquely identifies this resource claim inside the pod. + This must be a DNS_LABEL. + type: string + resourceClaimName: + description: |- + ResourceClaimName is the name of a ResourceClaim object in the same + namespace as this pod. + + Exactly one of ResourceClaimName and ResourceClaimTemplateName must + be set. + type: string + resourceClaimTemplateName: + description: |- + ResourceClaimTemplateName is the name of a ResourceClaimTemplate + object in the same namespace as this pod. + + The template will be used to create a new ResourceClaim, which will + be bound to this pod. When this pod is deleted, the ResourceClaim + will also be deleted. The pod name and resource name, along with a + generated component, will be used to form a unique name for the + ResourceClaim, which will be recorded in pod.status.resourceClaimStatuses. + + This field is immutable and no changes will be made to the + corresponding ResourceClaim by the control plane after creating the + ResourceClaim. + + Exactly one of ResourceClaimName and ResourceClaimTemplateName must + be set. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + restartPolicy: + description: |- + Restart policy for all containers within the pod. + One of Always, OnFailure, Never. In some contexts, only a subset of those values may be permitted. + Default to Always. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy + type: string + runtimeClassName: + description: |- + RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used + to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. + If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an + empty definition that uses the default runtime handler. + More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class + type: string + schedulerName: + description: |- + If specified, the pod will be dispatched by specified scheduler. + If not specified, the pod will be dispatched by default scheduler. + type: string + schedulingGates: + description: |- + SchedulingGates is an opaque list of values that if specified will block scheduling the pod. + If schedulingGates is not empty, the pod will stay in the SchedulingGated state and the + scheduler will not attempt to schedule the pod. + + SchedulingGates can only be set at pod creation time, and be removed only afterwards. + items: + description: PodSchedulingGate is associated to a + Pod to guard its scheduling. + properties: + name: + description: |- + Name of the scheduling gate. + Each scheduling gate must have a unique name field. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + securityContext: + description: |- + SecurityContext holds pod-level security attributes and common container settings. + Optional: Defaults to empty. See type description for default values of each field. + properties: + appArmorProfile: + description: |- + appArmorProfile is the AppArmor options to use by the containers in this pod. + Note that this field cannot be set when spec.os.name is windows. 
+ properties: + localhostProfile: + description: |- + localhostProfile indicates a profile loaded on the node that should be used. + The profile must be preconfigured on the node to work. + Must match the loaded name of the profile. + Must be set if and only if type is "Localhost". + type: string + type: + description: |- + type indicates which kind of AppArmor profile will be applied. + Valid options are: + Localhost - a profile pre-loaded on the node. + RuntimeDefault - the container runtime's default profile. + Unconfined - no AppArmor enforcement. + type: string + required: + - type + type: object + fsGroup: + description: |- + A special supplemental group that applies to all containers in a pod. + Some volume types allow the Kubelet to change the ownership of that volume + to be owned by the pod: + + 1. The owning GID will be the FSGroup + 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) + 3. The permission bits are OR'd with rw-rw---- + + If unset, the Kubelet will not modify the ownership and permissions of any volume. + Note that this field cannot be set when spec.os.name is windows. + format: int64 + type: integer + fsGroupChangePolicy: + description: |- + fsGroupChangePolicy defines behavior of changing ownership and permission of the volume + before being exposed inside Pod. This field will only apply to + volume types which support fsGroup based ownership(and permissions). + It will have no effect on ephemeral volume types such as: secret, configmaps + and emptydir. + Valid values are "OnRootMismatch" and "Always". If not specified, "Always" is used. + Note that this field cannot be set when spec.os.name is windows. + type: string + runAsGroup: + description: |- + The GID to run the entrypoint of the container process. + Uses runtime default if unset. + May also be set in SecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence + for that container. + Note that this field cannot be set when spec.os.name is windows. + format: int64 + type: integer + runAsNonRoot: + description: |- + Indicates that the container must run as a non-root user. + If true, the Kubelet will validate the image at runtime to ensure that it + does not run as UID 0 (root) and fail to start the container if it does. + If unset or false, no such validation will be performed. + May also be set in SecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: |- + The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. + May also be set in SecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence + for that container. + Note that this field cannot be set when spec.os.name is windows. + format: int64 + type: integer + seLinuxOptions: + description: |- + The SELinux context to be applied to all containers. + If unspecified, the container runtime will allocate a random SELinux context for each + container. May also be set in SecurityContext. If set in + both SecurityContext and PodSecurityContext, the value specified in SecurityContext + takes precedence for that container. + Note that this field cannot be set when spec.os.name is windows. + properties: + level: + description: Level is SELinux level label that + applies to the container. 
+ type: string + role: + description: Role is a SELinux role label that + applies to the container. + type: string + type: + description: Type is a SELinux type label that + applies to the container. + type: string + user: + description: User is a SELinux user label that + applies to the container. + type: string + type: object + seccompProfile: + description: |- + The seccomp options to use by the containers in this pod. + Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile defined in a file on the node should be used. + The profile must be preconfigured on the node to work. + Must be a descending path, relative to the kubelet's configured seccomp profile location. + Must be set if type is "Localhost". Must NOT be set for any other type. + type: string + type: + description: |- + type indicates which kind of seccomp profile will be applied. + Valid options are: + + Localhost - a profile defined in a file on the node should be used. + RuntimeDefault - the container runtime default profile should be used. + Unconfined - no profile should be applied. + type: string + required: + - type + type: object + supplementalGroups: + description: |- + A list of groups applied to the first process run in each container, in + addition to the container's primary GID and fsGroup (if specified). If + the SupplementalGroupsPolicy feature is enabled, the + supplementalGroupsPolicy field determines whether these are in addition + to or instead of any group memberships defined in the container image. + If unspecified, no additional groups are added, though group memberships + defined in the container image may still be used, depending on the + supplementalGroupsPolicy field. + Note that this field cannot be set when spec.os.name is windows. + items: + format: int64 + type: integer + type: array + x-kubernetes-list-type: atomic + supplementalGroupsPolicy: + description: |- + Defines how supplemental groups of the first container processes are calculated. + Valid values are "Merge" and "Strict". If not specified, "Merge" is used. + (Alpha) Using the field requires the SupplementalGroupsPolicy feature gate to be enabled + and the container runtime must implement support for this feature. + Note that this field cannot be set when spec.os.name is windows. + type: string + sysctls: + description: |- + Sysctls hold a list of namespaced sysctls used for the pod. Pods with unsupported + sysctls (by the container runtime) might fail to launch. + Note that this field cannot be set when spec.os.name is windows. + items: + description: Sysctl defines a kernel parameter + to be set + properties: + name: + description: Name of a property to set + type: string + value: + description: Value of a property to set + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + windowsOptions: + description: |- + The Windows specific settings applied to all containers. + If unspecified, the options within a container's SecurityContext will be used. + If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is linux. + properties: + gmsaCredentialSpec: + description: |- + GMSACredentialSpec is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the + GMSA credential spec named by the GMSACredentialSpecName field. 
+ type: string
+ gmsaCredentialSpecName:
+ description: GMSACredentialSpecName is the name
+ of the GMSA credential spec to use.
+ type: string
+ hostProcess:
+ description: |-
+ HostProcess determines if a container should be run as a 'Host Process' container.
+ All of a Pod's containers must have the same effective HostProcess value
+ (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers).
+ In addition, if HostProcess is true then HostNetwork must also be set to true.
+ type: boolean
+ runAsUserName:
+ description: |-
+ The UserName in Windows to run the entrypoint of the container process.
+ Defaults to the user specified in image metadata if unspecified.
+ May also be set in PodSecurityContext. If set in both SecurityContext and
+ PodSecurityContext, the value specified in SecurityContext takes precedence.
+ type: string
+ type: object
+ type: object
+ serviceAccount:
+ description: |-
+ DeprecatedServiceAccount is a deprecated alias for ServiceAccountName.
+ Deprecated: Use serviceAccountName instead.
+ type: string
+ serviceAccountName:
+ description: |-
+ ServiceAccountName is the name of the ServiceAccount to use to run this pod.
+ More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/
+ type: string
+ setHostnameAsFQDN:
+ description: |-
+ If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default).
+ In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname).
+ In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN.
+ If a pod does not have FQDN, this has no effect.
+ Default to false.
+ type: boolean
+ shareProcessNamespace:
+ description: |-
+ Share a single process namespace between all of the containers in a pod.
+ When this is set containers will be able to view and signal processes from other containers
+ in the same pod, and the first process in each container will not be assigned PID 1.
+ HostPID and ShareProcessNamespace cannot both be set.
+ Optional: Default to false.
+ type: boolean
+ subdomain:
+ description: |-
+ If specified, the fully qualified Pod hostname will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>".
+ If not specified, the pod will not have a domainname at all.
+ type: string
+ terminationGracePeriodSeconds:
+ description: |-
+ Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request.
+ Value must be non-negative integer. The value zero indicates stop immediately via
+ the kill signal (no opportunity to shut down).
+ If this value is nil, the default grace period will be used instead.
+ The grace period is the duration in seconds after the processes running in the pod are sent
+ a termination signal and the time when the processes are forcibly halted with a kill signal.
+ Set this value longer than the expected cleanup time for your process.
+ Defaults to 30 seconds.
+ format: int64
+ type: integer
+ tolerations:
+ description: If specified, the pod's tolerations.
+ items:
+ description: |-
+ The pod this Toleration is attached to tolerates any taint that matches
+ the triple <key,value,effect> using the matching operator <operator>.
+ properties:
+ effect:
+ description: |-
+ Effect indicates the taint effect to match. Empty means match all taint effects.
+ When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
+ type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + x-kubernetes-list-type: atomic + topologySpreadConstraints: + description: |- + TopologySpreadConstraints describes how a group of pods ought to spread across topology + domains. Scheduler will schedule pods in a way which abides by the constraints. + All topologySpreadConstraints are ANDed. + items: + description: TopologySpreadConstraint specifies how + to spread matching pods among the given topology. + properties: + labelSelector: + description: |- + LabelSelector is used to find matching pods. + Pods that match this label selector are counted to determine the number of pods + in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of + label selector requirements. The requirements + are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that + the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: |- + MatchLabelKeys is a set of pod label keys to select the pods over which + spreading will be calculated. The keys are used to lookup values from the + incoming pod labels, those key-value labels are ANDed with labelSelector + to select the group of existing pods over which spreading will be calculated + for the incoming pod. 
The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. + MatchLabelKeys cannot be set when LabelSelector isn't set. + Keys that don't exist in the incoming pod labels will + be ignored. A null or empty list means only match against labelSelector. + + This is a beta field and requires the MatchLabelKeysInPodTopologySpread feature gate to be enabled (enabled by default). + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxSkew: + description: |- + MaxSkew describes the degree to which pods may be unevenly distributed. + When `whenUnsatisfiable=DoNotSchedule`, it is the maximum permitted difference + between the number of matching pods in the target topology and the global minimum. + The global minimum is the minimum number of matching pods in an eligible domain + or zero if the number of eligible domains is less than MinDomains. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 2/2/1: + In this case, the global minimum is 1. + | zone1 | zone2 | zone3 | + | P P | P P | P | + - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 2/2/2; + scheduling it onto zone1(zone2) would make the ActualSkew(3-1) on zone1(zone2) + violate MaxSkew(1). + - if MaxSkew is 2, incoming pod can be scheduled onto any zone. + When `whenUnsatisfiable=ScheduleAnyway`, it is used to give higher precedence + to topologies that satisfy it. + It's a required field. Default value is 1 and 0 is not allowed. + format: int32 + type: integer + minDomains: + description: |- + MinDomains indicates a minimum number of eligible domains. + When the number of eligible domains with matching topology keys is less than minDomains, + Pod Topology Spread treats "global minimum" as 0, and then the calculation of Skew is performed. + And when the number of eligible domains with matching topology keys equals or greater than minDomains, + this value has no effect on scheduling. + As a result, when the number of eligible domains is less than minDomains, + scheduler won't schedule more than maxSkew Pods to those domains. + If value is nil, the constraint behaves as if MinDomains is equal to 1. + Valid values are integers greater than 0. + When value is not nil, WhenUnsatisfiable must be DoNotSchedule. + + For example, in a 3-zone cluster, MaxSkew is set to 2, MinDomains is set to 5 and pods with the same + labelSelector spread as 2/2/2: + | zone1 | zone2 | zone3 | + | P P | P P | P P | + The number of domains is less than 5(MinDomains), so "global minimum" is treated as 0. + In this situation, new pod with the same labelSelector cannot be scheduled, + because computed skew will be 3(3 - 0) if new Pod is scheduled to any of the three zones, + it will violate MaxSkew. + format: int32 + type: integer + nodeAffinityPolicy: + description: |- + NodeAffinityPolicy indicates how we will treat Pod's nodeAffinity/nodeSelector + when calculating pod topology spread skew. Options are: + - Honor: only nodes matching nodeAffinity/nodeSelector are included in the calculations. + - Ignore: nodeAffinity/nodeSelector are ignored. All nodes are included in the calculations. + + If this value is nil, the behavior is equivalent to the Honor policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. + type: string + nodeTaintsPolicy: + description: |- + NodeTaintsPolicy indicates how we will treat node taints when calculating + pod topology spread skew. 
Options are:
+ - Honor: nodes without taints, along with tainted nodes for which the incoming pod
+ has a toleration, are included.
+ - Ignore: node taints are ignored. All nodes are included.
+
+ If this value is nil, the behavior is equivalent to the Ignore policy.
+ This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag.
+ type: string
+ topologyKey:
+ description: |-
+ TopologyKey is the key of node labels. Nodes that have a label with this key
+ and identical values are considered to be in the same topology.
+ We consider each <key, value> as a "bucket", and try to put balanced number
+ of pods into each bucket.
+ We define a domain as a particular instance of a topology.
+ Also, we define an eligible domain as a domain whose nodes meet the requirements of
+ nodeAffinityPolicy and nodeTaintsPolicy.
+ e.g. If TopologyKey is "kubernetes.io/hostname", each Node is a domain of that topology.
+ And, if TopologyKey is "topology.kubernetes.io/zone", each zone is a domain of that topology.
+ It's a required field.
+ type: string
+ whenUnsatisfiable:
+ description: |-
+ WhenUnsatisfiable indicates how to deal with a pod if it doesn't satisfy
+ the spread constraint.
+ - DoNotSchedule (default) tells the scheduler not to schedule it.
+ - ScheduleAnyway tells the scheduler to schedule the pod in any location,
+ but giving higher precedence to topologies that would help reduce the
+ skew.
+ A constraint is considered "Unsatisfiable" for an incoming pod
+ if and only if every possible node assignment for that pod would violate
+ "MaxSkew" on some topology.
+ For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same
+ labelSelector spread as 3/1/1:
+ | zone1 | zone2 | zone3 |
+ | P P P | P | P |
+ If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled
+ to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies
+ MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler
+ won't make it *more* imbalanced.
+ It's a required field.
+ type: string
+ required:
+ - maxSkew
+ - topologyKey
+ - whenUnsatisfiable
+ type: object
+ type: array
+ x-kubernetes-list-map-keys:
+ - topologyKey
+ - whenUnsatisfiable
+ x-kubernetes-list-type: map
+ volumes:
+ description: |-
+ List of volumes that can be mounted by containers belonging to the pod.
+ More info: https://kubernetes.io/docs/concepts/storage/volumes
+ items:
+ description: Volume represents a named volume in a
+ pod that may be accessed by any container in the
+ pod.
+ properties:
+ awsElasticBlockStore:
+ description: |-
+ awsElasticBlockStore represents an AWS Disk resource that is attached to a
+ kubelet's host machine and then exposed to the pod.
+ More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore
+ properties:
+ fsType:
+ description: |-
+ fsType is the filesystem type of the volume that you want to mount.
+ Tip: Ensure that the filesystem type is supported by the host operating system.
+ Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+ More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore
+ type: string
+ partition:
+ description: |-
+ partition is the partition in the volume that you want to mount.
+ If omitted, the default is to mount by volume name.
+ Examples: For volume /dev/sda1, you specify the partition as "1".
+ Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty).
+ format: int32 + type: integer + readOnly: + description: |- + readOnly value true will force the readOnly setting in VolumeMounts. + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore + type: boolean + volumeID: + description: |- + volumeID is unique ID of the persistent disk resource in AWS (Amazon EBS volume). + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore + type: string + required: + - volumeID + type: object + azureDisk: + description: azureDisk represents an Azure Data + Disk mount on the host and bind mount to the + pod. + properties: + cachingMode: + description: 'cachingMode is the Host Caching + mode: None, Read Only, Read Write.' + type: string + diskName: + description: diskName is the Name of the data + disk in the blob storage + type: string + diskURI: + description: diskURI is the URI of data disk + in the blob storage + type: string + fsType: + default: ext4 + description: |- + fsType is Filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + kind: + description: 'kind expected values are Shared: + multiple blob disks per storage account Dedicated: + single blob disk per storage account Managed: + azure managed data disk (only in managed + availability set). defaults to shared' + type: string + readOnly: + default: false + description: |- + readOnly Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + required: + - diskName + - diskURI + type: object + azureFile: + description: azureFile represents an Azure File + Service mount on the host and bind mount to + the pod. + properties: + readOnly: + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + secretName: + description: secretName is the name of secret + that contains Azure Storage Account Name + and Key + type: string + shareName: + description: shareName is the azure share + Name + type: string + required: + - secretName + - shareName + type: object + cephfs: + description: cephFS represents a Ceph FS mount + on the host that shares a pod's lifetime + properties: + monitors: + description: |- + monitors is Required: Monitors is a collection of Ceph monitors + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it + items: + type: string + type: array + x-kubernetes-list-type: atomic + path: + description: 'path is Optional: Used as the + mounted root, rather than the full Ceph + tree, default is /' + type: string + readOnly: + description: |- + readOnly is Optional: Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it + type: boolean + secretFile: + description: |- + secretFile is Optional: SecretFile is the path to key ring for User, default is /etc/ceph/user.secret + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it + type: string + secretRef: + description: |- + secretRef is Optional: SecretRef is reference to the authentication secret for User, default is empty. + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it + properties: + name: + default: "" + description: |- + Name of the referent. 
+ This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + user: + description: |- + user is optional: User is the rados user name, default is admin + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it + type: string + required: + - monitors + type: object + cinder: + description: |- + cinder represents a cinder volume attached and mounted on kubelets host machine. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md + properties: + fsType: + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md + type: string + readOnly: + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md + type: boolean + secretRef: + description: |- + secretRef is optional: points to a secret object containing parameters used to connect + to OpenStack. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + volumeID: + description: |- + volumeID used to identify the volume in cinder. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md + type: string + required: + - volumeID + type: object + configMap: + description: configMap represents a configMap + that should populate this volume + properties: + defaultMode: + description: |- + defaultMode is optional: mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + Defaults to 0644. + Directories within the path are not affected by this setting. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + items: + description: |- + items if unspecified, each key-value pair in the Data field of the referenced + ConfigMap will be projected into the volume as a file whose name is the + key and content is the value. If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the ConfigMap, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key to a path + within a volume. + properties: + key: + description: key is the key to project. + type: string + mode: + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. 
+ YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + path: + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + x-kubernetes-list-type: atomic + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: optional specify whether the + ConfigMap or its keys must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + csi: + description: csi (Container Storage Interface) + represents ephemeral storage that is handled + by certain external CSI drivers (Beta feature). + properties: + driver: + description: |- + driver is the name of the CSI driver that handles this volume. + Consult with your admin for the correct name as registered in the cluster. + type: string + fsType: + description: |- + fsType to mount. Ex. "ext4", "xfs", "ntfs". + If not provided, the empty value is passed to the associated CSI driver + which will determine the default filesystem to apply. + type: string + nodePublishSecretRef: + description: |- + nodePublishSecretRef is a reference to the secret object containing + sensitive information to pass to the CSI driver to complete the CSI + NodePublishVolume and NodeUnpublishVolume calls. + This field is optional, and may be empty if no secret is required. If the + secret object contains more than one secret, all secret references are passed. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + readOnly: + description: |- + readOnly specifies a read-only configuration for the volume. + Defaults to false (read/write). + type: boolean + volumeAttributes: + additionalProperties: + type: string + description: |- + volumeAttributes stores driver-specific properties that are passed to the CSI + driver. Consult your driver's documentation for supported values. + type: object + required: + - driver + type: object + downwardAPI: + description: downwardAPI represents downward API + about the pod that should populate this volume + properties: + defaultMode: + description: |- + Optional: mode bits to use on created files by default. Must be a + Optional: mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + Defaults to 0644. + Directories within the path are not affected by this setting. 
+ This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + items: + description: Items is a list of downward API + volume file + items: + description: DownwardAPIVolumeFile represents + information to create the file containing + the pod field + properties: + fieldRef: + description: 'Required: Selects a field + of the pod: only annotations, labels, + name, namespace and uid are supported.' + properties: + apiVersion: + description: Version of the schema + the FieldPath is written in terms + of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to + select in the specified API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + mode: + description: |- + Optional: mode bits used to set permissions on this file, must be an octal value + between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + path: + description: 'Required: Path is the + relative path name of the file to + be created. Must not be absolute or + contain the ''..'' path. Must be utf-8 + encoded. The first item of the relative + path must not start with ''..''' + type: string + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. + properties: + containerName: + description: 'Container name: required + for volumes, optional for env + vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output + format of the exposed resources, + defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource + to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + required: + - path + type: object + type: array + x-kubernetes-list-type: atomic + type: object + emptyDir: + description: |- + emptyDir represents a temporary directory that shares a pod's lifetime. + More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir + properties: + medium: + description: |- + medium represents what type of storage medium should back this directory. + The default is "" which means to use the node's default medium. + Must be an empty string (default) or Memory. + More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir + type: string + sizeLimit: + anyOf: + - type: integer + - type: string + description: |- + sizeLimit is the total amount of local storage required for this EmptyDir volume. + The size limit is also applicable for memory medium. + The maximum usage on memory medium EmptyDir would be the minimum value between + the SizeLimit specified here and the sum of memory limits of all containers in a pod. + The default is nil which means that the limit is undefined. 
+ More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+ x-kubernetes-int-or-string: true
+ type: object
+ ephemeral:
+ description: |-
+ ephemeral represents a volume that is handled by a cluster storage driver.
+ The volume's lifecycle is tied to the pod that defines it - it will be created before the pod starts,
+ and deleted when the pod is removed.
+
+ Use this if:
+ a) the volume is only needed while the pod runs,
+ b) features of normal volumes like restoring from snapshot or capacity
+ tracking are needed,
+ c) the storage driver is specified through a storage class, and
+ d) the storage driver supports dynamic volume provisioning through
+ a PersistentVolumeClaim (see EphemeralVolumeSource for more
+ information on the connection between this volume type
+ and PersistentVolumeClaim).
+
+ Use PersistentVolumeClaim or one of the vendor-specific
+ APIs for volumes that persist for longer than the lifecycle
+ of an individual pod.
+
+ Use CSI for light-weight local ephemeral volumes if the CSI driver is meant to
+ be used that way - see the documentation of the driver for
+ more information.
+
+ A pod can use both types of ephemeral volumes and
+ persistent volumes at the same time.
+ properties:
+ volumeClaimTemplate:
+ description: |-
+ Will be used to create a stand-alone PVC to provision the volume.
+ The pod in which this EphemeralVolumeSource is embedded will be the
+ owner of the PVC, i.e. the PVC will be deleted together with the
+ pod. The name of the PVC will be `<pod name>-<volume name>` where
+ `<volume name>` is the name from the `PodSpec.Volumes` array
+ entry. Pod validation will reject the pod if the concatenated name
+ is not valid for a PVC (for example, too long).
+
+ An existing PVC with that name that is not owned by the pod
+ will *not* be used for the pod to avoid using an unrelated
+ volume by mistake. Starting the pod is then blocked until
+ the unrelated PVC is removed. If such a pre-created PVC is
+ meant to be used by the pod, the PVC has to updated with an
+ owner reference to the pod once the pod exists. Normally
+ this should not be necessary, but it may be useful when
+ manually reconstructing a broken cluster.
+
+ This field is read-only and no changes will be made by Kubernetes
+ to the PVC after it has been created.
+
+ Required, must not be nil.
+ properties:
+ metadata:
+ description: |-
+ May contain labels and annotations that will be copied into the PVC
+ when creating it. No other fields are allowed and will be rejected during
+ validation.
+ properties:
+ annotations:
+ additionalProperties:
+ type: string
+ type: object
+ finalizers:
+ items:
+ type: string
+ type: array
+ labels:
+ additionalProperties:
+ type: string
+ type: object
+ name:
+ type: string
+ namespace:
+ type: string
+ type: object
+ spec:
+ description: |-
+ The specification for the PersistentVolumeClaim. The entire content is
+ copied unchanged into the PVC that gets created from this
+ template. The same fields as in a PersistentVolumeClaim
+ are also valid here.
+ properties:
+ accessModes:
+ description: |-
+ accessModes contains the desired access modes the volume should have.
+ More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1 + items: + type: string + type: array + x-kubernetes-list-type: atomic + dataSource: + description: |- + dataSource field can be used to specify either: + * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) + * An existing PVC (PersistentVolumeClaim) + If the provisioner or an external controller can support the specified data source, + it will create a new volume based on the contents of the specified data source. + When the AnyVolumeDataSource feature gate is enabled, dataSource contents will be copied to dataSourceRef, + and dataSourceRef contents will be copied to dataSource when dataSourceRef.namespace is not specified. + If the namespace is specified, then dataSourceRef will not be copied to dataSource. + properties: + apiGroup: + description: |- + APIGroup is the group for the resource being referenced. + If APIGroup is not specified, the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type + of resource being referenced + type: string + name: + description: Name is the name + of resource being referenced + type: string + required: + - kind + - name + type: object + x-kubernetes-map-type: atomic + dataSourceRef: + description: |- + dataSourceRef specifies the object from which to populate the volume with data, if a non-empty + volume is desired. This may be any object from a non-empty API group (non + core object) or a PersistentVolumeClaim object. + When this field is specified, volume binding will only succeed if the type of + the specified object matches some installed volume populator or dynamic + provisioner. + This field will replace the functionality of the dataSource field and as such + if both fields are non-empty, they must have the same value. For backwards + compatibility, when namespace isn't specified in dataSourceRef, + both fields (dataSource and dataSourceRef) will be set to the same + value automatically if one of them is empty and the other is non-empty. + When namespace is specified in dataSourceRef, + dataSource isn't set to the same value and must be empty. + There are three important differences between dataSource and dataSourceRef: + * While dataSource only allows two specific types of objects, dataSourceRef + allows any non-core object, as well as PersistentVolumeClaim objects. + * While dataSource ignores disallowed values (dropping them), dataSourceRef + preserves all values, and generates an error if a disallowed value is + specified. + * While dataSource only allows local objects, dataSourceRef allows objects + in any namespaces. + (Beta) Using this field requires the AnyVolumeDataSource feature gate to be enabled. + (Alpha) Using the namespace field of dataSourceRef requires the CrossNamespaceVolumeDataSource feature gate to be enabled. + properties: + apiGroup: + description: |- + APIGroup is the group for the resource being referenced. + If APIGroup is not specified, the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. 
+ type: string + kind: + description: Kind is the type + of resource being referenced + type: string + name: + description: Name is the name + of resource being referenced + type: string + namespace: + description: |- + Namespace is the namespace of resource being referenced + Note that when a namespace is specified, a gateway.networking.k8s.io/ReferenceGrant object is required in the referent namespace to allow that namespace's owner to accept the reference. See the ReferenceGrant documentation for details. + (Alpha) This field requires the CrossNamespaceVolumeDataSource feature gate to be enabled. + type: string + required: + - kind + - name + type: object + resources: + description: |- + resources represents the minimum resources the volume should have. + If RecoverVolumeExpansionFailure feature is enabled users are allowed to specify resource requirements + that are lower than previous value but must still be higher than capacity recorded in the + status field of the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + selector: + description: selector is a label query + over volumes to consider for binding. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". 
The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + storageClassName: + description: |- + storageClassName is the name of the StorageClass required by the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1 + type: string + volumeAttributesClassName: + description: |- + volumeAttributesClassName may be used to set the VolumeAttributesClass used by this claim. + If specified, the CSI driver will create or update the volume with the attributes defined + in the corresponding VolumeAttributesClass. This has a different purpose than storageClassName, + it can be changed after the claim is created. An empty string value means that no VolumeAttributesClass + will be applied to the claim but it's not allowed to reset this field to empty string once it is set. + If unspecified and the PersistentVolumeClaim is unbound, the default VolumeAttributesClass + will be set by the persistentvolume controller if it exists. + If the resource referred to by volumeAttributesClass does not exist, this PersistentVolumeClaim will be + set to a Pending state, as reflected by the modifyVolumeStatus field, until such as a resource + exists. + More info: https://kubernetes.io/docs/concepts/storage/volume-attributes-classes/ + (Beta) Using this field requires the VolumeAttributesClass feature gate to be enabled (off by default). + type: string + volumeMode: + description: |- + volumeMode defines what type of volume is required by the claim. + Value of Filesystem is implied when not included in claim spec. + type: string + volumeName: + description: volumeName is the binding + reference to the PersistentVolume + backing this claim. + type: string + type: object + required: + - spec + type: object + type: object + fc: + description: fc represents a Fibre Channel resource + that is attached to a kubelet's host machine + and then exposed to the pod. + properties: + fsType: + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + lun: + description: 'lun is Optional: FC target lun + number' + format: int32 + type: integer + readOnly: + description: |- + readOnly is Optional: Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + targetWWNs: + description: 'targetWWNs is Optional: FC target + worldwide names (WWNs)' + items: + type: string + type: array + x-kubernetes-list-type: atomic + wwids: + description: |- + wwids Optional: FC volume world wide identifiers (wwids) + Either wwids or combination of targetWWNs and lun must be set, but not both simultaneously. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + flexVolume: + description: |- + flexVolume represents a generic volume resource that is + provisioned/attached using an exec based plugin. + properties: + driver: + description: driver is the name of the driver + to use for this volume. + type: string + fsType: + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". The default filesystem depends on FlexVolume script. + type: string + options: + additionalProperties: + type: string + description: 'options is Optional: this field + holds extra command options if any.' 
+ type: object + readOnly: + description: |- + readOnly is Optional: defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + secretRef: + description: |- + secretRef is Optional: secretRef is reference to the secret object containing + sensitive information to pass to the plugin scripts. This may be + empty if no secret object is specified. If the secret object + contains more than one secret, all secrets are passed to the plugin + scripts. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + required: + - driver + type: object + flocker: + description: flocker represents a Flocker volume + attached to a kubelet's host machine. This depends + on the Flocker control service being running + properties: + datasetName: + description: |- + datasetName is Name of the dataset stored as metadata -> name on the dataset for Flocker + should be considered as deprecated + type: string + datasetUUID: + description: datasetUUID is the UUID of the + dataset. This is unique identifier of a + Flocker dataset + type: string + type: object + gcePersistentDisk: + description: |- + gcePersistentDisk represents a GCE Disk resource that is attached to a + kubelet's host machine and then exposed to the pod. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + properties: + fsType: + description: |- + fsType is filesystem type of the volume that you want to mount. + Tip: Ensure that the filesystem type is supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + type: string + partition: + description: |- + partition is the partition in the volume that you want to mount. + If omitted, the default is to mount by volume name. + Examples: For volume /dev/sda1, you specify the partition as "1". + Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty). + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + format: int32 + type: integer + pdName: + description: |- + pdName is unique name of the PD resource in GCE. Used to identify the disk in GCE. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + type: string + readOnly: + description: |- + readOnly here will force the ReadOnly setting in VolumeMounts. + Defaults to false. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + type: boolean + required: + - pdName + type: object + gitRepo: + description: |- + gitRepo represents a git repository at a particular revision. + DEPRECATED: GitRepo is deprecated. To provision a container with a git repo, mount an + EmptyDir into an InitContainer that clones the repo using git, then mount the EmptyDir + into the Pod's container. + properties: + directory: + description: |- + directory is the target directory name. + Must not contain or start with '..'. If '.' is supplied, the volume directory will be the + git repository. 
Otherwise, if specified, the volume will contain the git repository in + the subdirectory with the given name. + type: string + repository: + description: repository is the URL + type: string + revision: + description: revision is the commit hash for + the specified revision. + type: string + required: + - repository + type: object + glusterfs: + description: |- + glusterfs represents a Glusterfs mount on the host that shares a pod's lifetime. + More info: https://examples.k8s.io/volumes/glusterfs/README.md + properties: + endpoints: + description: |- + endpoints is the endpoint name that details Glusterfs topology. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod + type: string + path: + description: |- + path is the Glusterfs volume path. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod + type: string + readOnly: + description: |- + readOnly here will force the Glusterfs volume to be mounted with read-only permissions. + Defaults to false. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod + type: boolean + required: + - endpoints + - path + type: object + hostPath: + description: |- + hostPath represents a pre-existing file or directory on the host + machine that is directly exposed to the container. This is generally + used for system agents or other privileged things that are allowed + to see the host machine. Most containers will NOT need this. + More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath + properties: + path: + description: |- + path of the directory on the host. + If the path is a symlink, it will follow the link to the real path. + More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath + type: string + type: + description: |- + type for HostPath Volume + Defaults to "" + More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath + type: string + required: + - path + type: object + image: + description: |- + image represents an OCI object (a container image or artifact) pulled and mounted on the kubelet's host machine. + The volume is resolved at pod startup depending on which PullPolicy value is provided: + + - Always: the kubelet always attempts to pull the reference. Container creation will fail If the pull fails. + - Never: the kubelet never pulls the reference and only uses a local image or artifact. Container creation will fail if the reference isn't present. + - IfNotPresent: the kubelet pulls if the reference isn't already present on disk. Container creation will fail if the reference isn't present and the pull fails. + + The volume gets re-resolved if the pod gets deleted and recreated, which means that new remote content will become available on pod recreation. + A failure to resolve or pull the image during pod startup will block containers from starting and may add significant latency. Failures will be retried using normal volume backoff and will be reported on the pod reason and message. + The types of objects that may be mounted by this volume are defined by the container runtime implementation on a host machine and at minimum must include all valid types supported by the container image field. + The OCI object gets mounted in a single directory (spec.containers[*].volumeMounts.mountPath) by merging the manifest layers in the same way as for container images. + The volume will be mounted read-only (ro) and non-executable files (noexec). 
+ Sub path mounts for containers are not supported (spec.containers[*].volumeMounts.subpath).
+ The field spec.securityContext.fsGroupChangePolicy has no effect on this volume type.
+ properties:
+ pullPolicy:
+ description: |-
+ Policy for pulling OCI objects. Possible values are:
+ Always: the kubelet always attempts to pull the reference. Container creation will fail If the pull fails.
+ Never: the kubelet never pulls the reference and only uses a local image or artifact. Container creation will fail if the reference isn't present.
+ IfNotPresent: the kubelet pulls if the reference isn't already present on disk. Container creation will fail if the reference isn't present and the pull fails.
+ Defaults to Always if :latest tag is specified, or IfNotPresent otherwise.
+ type: string
+ reference:
+ description: |-
+ Required: Image or artifact reference to be used.
+ Behaves in the same way as pod.spec.containers[*].image.
+ Pull secrets will be assembled in the same way as for the container image by looking up node credentials, SA image pull secrets, and pod spec image pull secrets.
+ More info: https://kubernetes.io/docs/concepts/containers/images
+ This field is optional to allow higher level config management to default or override
+ container images in workload controllers like Deployments and StatefulSets.
+ type: string
+ type: object
+ iscsi:
+ description: |-
+ iscsi represents an ISCSI Disk resource that is attached to a
+ kubelet's host machine and then exposed to the pod.
+ More info: https://examples.k8s.io/volumes/iscsi/README.md
+ properties:
+ chapAuthDiscovery:
+ description: chapAuthDiscovery defines whether
+ support iSCSI Discovery CHAP authentication
+ type: boolean
+ chapAuthSession:
+ description: chapAuthSession defines whether
+ support iSCSI Session CHAP authentication
+ type: boolean
+ fsType:
+ description: |-
+ fsType is the filesystem type of the volume that you want to mount.
+ Tip: Ensure that the filesystem type is supported by the host operating system.
+ Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+ More info: https://kubernetes.io/docs/concepts/storage/volumes#iscsi
+ type: string
+ initiatorName:
+ description: |-
+ initiatorName is the custom iSCSI Initiator Name.
+ If initiatorName is specified with iscsiInterface simultaneously, new iSCSI interface
+ <target portal>:<volume name> will be created for the connection.
+ type: string
+ iqn:
+ description: iqn is the target iSCSI Qualified
+ Name.
+ type: string
+ iscsiInterface:
+ default: default
+ description: |-
+ iscsiInterface is the interface Name that uses an iSCSI transport.
+ Defaults to 'default' (tcp).
+ type: string
+ lun:
+ description: lun represents iSCSI Target Lun
+ number.
+ format: int32
+ type: integer
+ portals:
+ description: |-
+ portals is the iSCSI Target Portal List. The portal is either an IP or ip_addr:port if the port
+ is other than default (typically TCP ports 860 and 3260).
+ items:
+ type: string
+ type: array
+ x-kubernetes-list-type: atomic
+ readOnly:
+ description: |-
+ readOnly here will force the ReadOnly setting in VolumeMounts.
+ Defaults to false.
+ type: boolean
+ secretRef:
+ description: secretRef is the CHAP Secret
+ for iSCSI target and initiator authentication
+ properties:
+ name:
+ default: ""
+ description: |-
+ Name of the referent.
+ This field is effectively required, but due to backwards compatibility is
+ allowed to be empty. Instances of this type with an empty value here are
+ almost certainly wrong.
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + targetPortal: + description: |- + targetPortal is iSCSI Target Portal. The Portal is either an IP or ip_addr:port if the port + is other than default (typically TCP ports 860 and 3260). + type: string + required: + - iqn + - lun + - targetPortal + type: object + name: + description: |- + name of the volume. + Must be a DNS_LABEL and unique within the pod. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + nfs: + description: |- + nfs represents an NFS mount on the host that shares a pod's lifetime + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs + properties: + path: + description: |- + path that is exported by the NFS server. + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs + type: string + readOnly: + description: |- + readOnly here will force the NFS export to be mounted with read-only permissions. + Defaults to false. + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs + type: boolean + server: + description: |- + server is the hostname or IP address of the NFS server. + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs + type: string + required: + - path + - server + type: object + persistentVolumeClaim: + description: |- + persistentVolumeClaimVolumeSource represents a reference to a + PersistentVolumeClaim in the same namespace. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims + properties: + claimName: + description: |- + claimName is the name of a PersistentVolumeClaim in the same namespace as the pod using this volume. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims + type: string + readOnly: + description: |- + readOnly Will force the ReadOnly setting in VolumeMounts. + Default false. + type: boolean + required: + - claimName + type: object + photonPersistentDisk: + description: photonPersistentDisk represents a + PhotonController persistent disk attached and + mounted on kubelets host machine + properties: + fsType: + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + pdID: + description: pdID is the ID that identifies + Photon Controller persistent disk + type: string + required: + - pdID + type: object + portworxVolume: + description: portworxVolume represents a portworx + volume attached and mounted on kubelets host + machine + properties: + fsType: + description: |- + fSType represents the filesystem type to mount + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs". Implicitly inferred to be "ext4" if unspecified. + type: string + readOnly: + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + volumeID: + description: volumeID uniquely identifies + a Portworx volume + type: string + required: + - volumeID + type: object + projected: + description: projected items for all in one resources + secrets, configmaps, and downward API + properties: + defaultMode: + description: |- + defaultMode are the mode bits used to set permissions on created files by default. 
+ Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + Directories within the path are not affected by this setting. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + sources: + description: |- + sources is the list of volume projections. Each entry in this list + handles one source. + items: + description: |- + Projection that may be projected along with other supported volume types. + Exactly one of these fields must be set. + properties: + clusterTrustBundle: + description: |- + ClusterTrustBundle allows a pod to access the `.spec.trustBundle` field + of ClusterTrustBundle objects in an auto-updating file. + + Alpha, gated by the ClusterTrustBundleProjection feature gate. + + ClusterTrustBundle objects can either be selected by name, or by the + combination of signer name and a label selector. + + Kubelet performs aggressive normalization of the PEM contents written + into the pod filesystem. Esoteric PEM features such as inter-block + comments and block headers are stripped. Certificates are deduplicated. + The ordering of certificates within the file is arbitrary, and Kubelet + may change the order over time. + properties: + labelSelector: + description: |- + Select all ClusterTrustBundles that match this label selector. Only has + effect if signerName is set. Mutually-exclusive with name. If unset, + interpreted as "match nothing". If set but empty, interpreted as "match + everything". + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + name: + description: |- + Select a single ClusterTrustBundle by object name. Mutually-exclusive + with signerName and labelSelector. + type: string + optional: + description: |- + If true, don't block pod startup if the referenced ClusterTrustBundle(s) + aren't available. If using name, then the named ClusterTrustBundle is + allowed not to exist. If using signerName, then the combination of + signerName and labelSelector is allowed to match zero + ClusterTrustBundles. 
+ type: boolean + path: + description: Relative path from + the volume root to write the bundle. + type: string + signerName: + description: |- + Select all ClusterTrustBundles that match this signer name. + Mutually-exclusive with name. The contents of all selected + ClusterTrustBundles will be unified and deduplicated. + type: string + required: + - path + type: object + configMap: + description: configMap information about + the configMap data to project + properties: + items: + description: |- + items if unspecified, each key-value pair in the Data field of the referenced + ConfigMap will be projected into the volume as a file whose name is the + key and content is the value. If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the ConfigMap, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key + to a path within a volume. + properties: + key: + description: key is the key + to project. + type: string + mode: + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + path: + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + x-kubernetes-list-type: atomic + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: optional specify whether + the ConfigMap or its keys must + be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + downwardAPI: + description: downwardAPI information + about the downwardAPI data to project + properties: + items: + description: Items is a list of + DownwardAPIVolume file + items: + description: DownwardAPIVolumeFile + represents information to create + the file containing the pod + field + properties: + fieldRef: + description: 'Required: Selects + a field of the pod: only + annotations, labels, name, + namespace and uid are supported.' + properties: + apiVersion: + description: Version of + the schema the FieldPath + is written in terms + of, defaults to "v1". + type: string + fieldPath: + description: Path of the + field to select in the + specified API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + mode: + description: |- + Optional: mode bits used to set permissions on this file, must be an octal value + between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. 
+ This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + path: + description: 'Required: Path + is the relative path name + of the file to be created. + Must not be absolute or + contain the ''..'' path. + Must be utf-8 encoded. The + first item of the relative + path must not start with + ''..''' + type: string + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. + properties: + containerName: + description: 'Container + name: required for volumes, + optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies + the output format of + the exposed resources, + defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: + resource to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + required: + - path + type: object + type: array + x-kubernetes-list-type: atomic + type: object + secret: + description: secret information about + the secret data to project + properties: + items: + description: |- + items if unspecified, each key-value pair in the Data field of the referenced + Secret will be projected into the volume as a file whose name is the + key and content is the value. If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the Secret, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key + to a path within a volume. + properties: + key: + description: key is the key + to project. + type: string + mode: + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + path: + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + x-kubernetes-list-type: atomic + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: optional field specify + whether the Secret or its key + must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + serviceAccountToken: + description: serviceAccountToken is + information about the serviceAccountToken + data to project + properties: + audience: + description: |- + audience is the intended audience of the token. A recipient of a token + must identify itself with an identifier specified in the audience of the + token, and otherwise should reject the token. The audience defaults to the + identifier of the apiserver. + type: string + expirationSeconds: + description: |- + expirationSeconds is the requested duration of validity of the service + account token. As the token approaches expiration, the kubelet volume + plugin will proactively rotate the service account token. The kubelet will + start trying to rotate the token if the token is older than 80 percent of + its time to live or if the token is older than 24 hours.Defaults to 1 hour + and must be at least 10 minutes. + format: int64 + type: integer + path: + description: |- + path is the path relative to the mount point of the file to project the + token into. + type: string + required: + - path + type: object + type: object + type: array + x-kubernetes-list-type: atomic + type: object + quobyte: + description: quobyte represents a Quobyte mount + on the host that shares a pod's lifetime + properties: + group: + description: |- + group to map volume access to + Default is no group + type: string + readOnly: + description: |- + readOnly here will force the Quobyte volume to be mounted with read-only permissions. + Defaults to false. + type: boolean + registry: + description: |- + registry represents a single or multiple Quobyte Registry services + specified as a string as host:port pair (multiple entries are separated with commas) + which acts as the central registry for volumes + type: string + tenant: + description: |- + tenant owning the given Quobyte volume in the Backend + Used with dynamically provisioned Quobyte volumes, value is set by the plugin + type: string + user: + description: |- + user to map volume access to + Defaults to serivceaccount user + type: string + volume: + description: volume is a string that references + an already created Quobyte volume by name. + type: string + required: + - registry + - volume + type: object + rbd: + description: |- + rbd represents a Rados Block Device mount on the host that shares a pod's lifetime. + More info: https://examples.k8s.io/volumes/rbd/README.md + properties: + fsType: + description: |- + fsType is the filesystem type of the volume that you want to mount. + Tip: Ensure that the filesystem type is supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#rbd + type: string + image: + description: |- + image is the rados image name. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + type: string + keyring: + default: /etc/ceph/keyring + description: |- + keyring is the path to key ring for RBDUser. + Default is /etc/ceph/keyring. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + type: string + monitors: + description: |- + monitors is a collection of Ceph monitors. 
+ More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + items: + type: string + type: array + x-kubernetes-list-type: atomic + pool: + default: rbd + description: |- + pool is the rados pool name. + Default is rbd. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + type: string + readOnly: + description: |- + readOnly here will force the ReadOnly setting in VolumeMounts. + Defaults to false. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + type: boolean + secretRef: + description: |- + secretRef is name of the authentication secret for RBDUser. If provided + overrides keyring. + Default is nil. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + user: + default: admin + description: |- + user is the rados user name. + Default is admin. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + type: string + required: + - image + - monitors + type: object + scaleIO: + description: scaleIO represents a ScaleIO persistent + volume attached and mounted on Kubernetes nodes. + properties: + fsType: + default: xfs + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". + Default is "xfs". + type: string + gateway: + description: gateway is the host address of + the ScaleIO API Gateway. + type: string + protectionDomain: + description: protectionDomain is the name + of the ScaleIO Protection Domain for the + configured storage. + type: string + readOnly: + description: |- + readOnly Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + secretRef: + description: |- + secretRef references to the secret for ScaleIO user and other + sensitive information. If this is not provided, Login operation will fail. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + sslEnabled: + description: sslEnabled Flag enable/disable + SSL communication with Gateway, default + false + type: boolean + storageMode: + default: ThinProvisioned + description: |- + storageMode indicates whether the storage for a volume should be ThickProvisioned or ThinProvisioned. + Default is ThinProvisioned. + type: string + storagePool: + description: storagePool is the ScaleIO Storage + Pool associated with the protection domain. + type: string + system: + description: system is the name of the storage + system as configured in ScaleIO. + type: string + volumeName: + description: |- + volumeName is the name of a volume already created in the ScaleIO system + that is associated with this volume source. 
+ type: string + required: + - gateway + - secretRef + - system + type: object + secret: + description: |- + secret represents a secret that should populate this volume. + More info: https://kubernetes.io/docs/concepts/storage/volumes#secret + properties: + defaultMode: + description: |- + defaultMode is Optional: mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values + for mode bits. Defaults to 0644. + Directories within the path are not affected by this setting. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + items: + description: |- + items If unspecified, each key-value pair in the Data field of the referenced + Secret will be projected into the volume as a file whose name is the + key and content is the value. If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the Secret, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key to a path + within a volume. + properties: + key: + description: key is the key to project. + type: string + mode: + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + path: + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + x-kubernetes-list-type: atomic + optional: + description: optional field specify whether + the Secret or its keys must be defined + type: boolean + secretName: + description: |- + secretName is the name of the secret in the pod's namespace to use. + More info: https://kubernetes.io/docs/concepts/storage/volumes#secret + type: string + type: object + storageos: + description: storageOS represents a StorageOS + volume attached and mounted on Kubernetes nodes. + properties: + fsType: + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + readOnly: + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + secretRef: + description: |- + secretRef specifies the secret to use for obtaining the StorageOS API + credentials. If not specified, default values will be attempted. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + volumeName: + description: |- + volumeName is the human-readable name of the StorageOS volume. Volume + names are only unique within a namespace. + type: string + volumeNamespace: + description: |- + volumeNamespace specifies the scope of the volume within StorageOS. If no + namespace is specified then the Pod's namespace will be used. This allows the + Kubernetes name scoping to be mirrored within StorageOS for tighter integration. + Set VolumeName to any name to override the default behaviour. + Set to "default" if you are not using namespaces within StorageOS. + Namespaces that do not pre-exist within StorageOS will be created. + type: string + type: object + vsphereVolume: + description: vsphereVolume represents a vSphere + volume attached and mounted on kubelets host + machine + properties: + fsType: + description: |- + fsType is filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + storagePolicyID: + description: storagePolicyID is the storage + Policy Based Management (SPBM) profile ID + associated with the StoragePolicyName. + type: string + storagePolicyName: + description: storagePolicyName is the storage + Policy Based Management (SPBM) profile name. + type: string + volumePath: + description: volumePath is the path that identifies + vSphere volume vmdk + type: string + required: + - volumePath + type: object + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - containers + type: object + type: object + topologyRequest: + description: topologyRequest defines the topology request for + the PodSet. + properties: + podIndexLabel: + description: |- + PodIndexLabel indicates the name of the label indexing the pods. + For example, in the context of + - kubernetes job this is: kubernetes.io/job-completion-index + - JobSet: kubernetes.io/job-completion-index (inherited from Job) + - Kubeflow: training.kubeflow.org/replica-index + type: string + preferred: + description: |- + preferred indicates the topology level preferred by the PodSet, as + indicated by the `kueue.x-k8s.io/podset-preferred-topology` PodSet + annotation. + type: string + required: + description: |- + required indicates the topology level required by the PodSet, as + indicated by the `kueue.x-k8s.io/podset-required-topology` PodSet + annotation. + type: string + subGroupCount: + description: |- + SubGroupIndexLabel indicates the count of replicated Jobs (groups) within a PodSet. + For example, in the context of JobSet this value is read from jobset.sigs.k8s.io/replicatedjob-replicas. + format: int32 + type: integer + subGroupIndexLabel: + description: |- + SubGroupIndexLabel indicates the name of the label indexing the instances of replicated Jobs (groups) + within a PodSet. For example, in the context of JobSet this is jobset.sigs.k8s.io/job-index. + type: string + type: object + required: + - count + - template + type: object + x-kubernetes-validations: + - message: minCount should be positive and less or equal to count + rule: 'has(self.minCount) ? 
self.minCount <= self.count : true' + maxItems: 8 + minItems: 1 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + priority: + description: |- + Priority determines the order of access to the resources managed by the + ClusterQueue where the workload is queued. + The priority value is populated from PriorityClassName. + The higher the value, the higher the priority. + If priorityClassName is specified, priority must not be null. + format: int32 + type: integer + priorityClassName: + description: |- + If specified, indicates the workload's priority. + "system-node-critical" and "system-cluster-critical" are two special + keywords which indicate the highest priorities with the former being + the highest priority. Any other name must be defined by creating a + PriorityClass object with that name. If not specified, the workload + priority will be default or zero if there is no default. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + priorityClassSource: + default: "" + description: |- + priorityClassSource determines whether the priorityClass field refers to a pod PriorityClass or kueue.x-k8s.io/workloadpriorityclass. + Workload's PriorityClass can accept the name of a pod priorityClass or a workloadPriorityClass. + When using pod PriorityClass, a priorityClassSource field has the scheduling.k8s.io/priorityclass value. + enum: + - kueue.x-k8s.io/workloadpriorityclass + - scheduling.k8s.io/priorityclass + - "" + type: string + queueName: + description: |- + queueName is the name of the LocalQueue the Workload is associated with. + queueName cannot be changed while .status.admission is not null. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + required: + - podSets + type: object + x-kubernetes-validations: + - message: priority should not be nil when priorityClassName is set + rule: 'has(self.priorityClassName) ? has(self.priority) : true' + status: + description: WorkloadStatus defines the observed state of Workload + properties: + accumulatedPastExexcutionTimeSeconds: + description: |- + accumulatedPastExexcutionTimeSeconds holds the total time, in seconds, the workload spent + in Admitted state, in the previous `Admit` - `Evict` cycles. + format: int32 + type: integer + admission: + description: |- + admission holds the parameters of the admission of the workload by a + ClusterQueue. admission can be set back to null, but its fields cannot be + changed once set. + properties: + clusterQueue: + description: clusterQueue is the name of the ClusterQueue that + admitted this workload. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + podSetAssignments: + description: PodSetAssignments hold the admission results for + each of the .spec.podSets entries. + items: + properties: + count: + description: |- + count is the number of pods taken into account at admission time. + This field will not change in case of quota reclaim. + Value could be missing for Workloads created before this field was added, + in that case spec.podSets[*].count value will be used. + format: int32 + minimum: 0 + type: integer + flavors: + additionalProperties: + description: ResourceFlavorReference is the name of the + ResourceFlavor. 
+ maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + description: Flavors are the flavors assigned to the workload + for each resource. + type: object + name: + default: main + description: Name is the name of the podSet. It should match + one of the names in .spec.podSets. + maxLength: 63 + pattern: ^(?i)[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + resourceUsage: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + resourceUsage keeps track of the total resources all the pods in the podset need to run. + + Beside what is provided in podSet's specs, this calculation takes into account + the LimitRange defaults and RuntimeClass overheads at the moment of admission. + This field will not change in case of quota reclaim. + type: object + topologyAssignment: + description: |- + topologyAssignment indicates the topology assignment divided into + topology domains corresponding to the lowest level of the topology. + The assignment specifies the number of Pods to be scheduled per topology + domain and specifies the node selectors for each topology domain, in the + following way: the node selector keys are specified by the levels field + (same for all domains), and the corresponding node selector value is + specified by the domains.values subfield. If the TopologySpec.Levels field contains + "kubernetes.io/hostname" label, topologyAssignment will contain data only for + this label, and omit higher levels in the topology + + Example: + + topologyAssignment: + levels: + - cloud.provider.com/topology-block + - cloud.provider.com/topology-rack + domains: + - values: [block-1, rack-1] + count: 4 + - values: [block-1, rack-2] + count: 2 + + Here: + - 4 Pods are to be scheduled on nodes matching the node selector: + cloud.provider.com/topology-block: block-1 + cloud.provider.com/topology-rack: rack-1 + - 2 Pods are to be scheduled on nodes matching the node selector: + cloud.provider.com/topology-block: block-1 + cloud.provider.com/topology-rack: rack-2 + + Example: + Below there is an equivalent of the above example assuming, Topology + object defines kubernetes.io/hostname as the lowest level in topology. + Hence we omit higher level of topologies, since the hostname label + is sufficient to explicitly identify a proper node. + + topologyAssignment: + levels: + - kubernetes.io/hostname + domains: + - values: [hostname-1] + count: 4 + - values: [hostname-2] + count: 2 + properties: + domains: + description: |- + domains is a list of topology assignments split by topology domains at + the lowest level of the topology. + items: + properties: + count: + description: |- + count indicates the number of Pods to be scheduled in the topology + domain indicated by the values field. + format: int32 + minimum: 1 + type: integer + values: + description: |- + values is an ordered list of node selector values describing a topology + domain. The values correspond to the consecutive topology levels, from + the highest to the lowest. + items: + type: string + maxItems: 8 + minItems: 1 + type: array + x-kubernetes-list-type: atomic + required: + - count + - values + type: object + type: array + levels: + description: |- + levels is an ordered list of keys denoting the levels of the assigned + topology (i.e. 
node label keys), from the highest to the lowest level of + the topology. + items: + type: string + maxItems: 8 + minItems: 1 + type: array + x-kubernetes-list-type: atomic + required: + - domains + - levels + type: object + required: + - name + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - clusterQueue + - podSetAssignments + type: object + admissionChecks: + description: admissionChecks list all the admission checks required + by the workload and the current status + items: + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + name: + description: name identifies the admission check. + maxLength: 316 + type: string + podSetUpdates: + items: + description: |- + PodSetUpdate contains a list of pod set modifications suggested by AdmissionChecks. + The modifications should be additive only - modifications of already existing keys + or having the same key provided by multiple AdmissionChecks is not allowed and will + result in failure during workload admission. + properties: + annotations: + additionalProperties: + type: string + type: object + labels: + additionalProperties: + type: string + type: object + name: + description: Name of the PodSet to modify. Should match + to one of the Workload's PodSets. + type: string + nodeSelector: + additionalProperties: + type: string + type: object + tolerations: + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + maxItems: 8 + type: array + x-kubernetes-validations: + - message: operator must be Exists when 'key' is empty, + which means 'match all values and all keys' + rule: 'self.all(x, !has(x.key) ? 
x.operator == ''Exists'' + : true)' + - message: effect must be 'NoExecute' when 'tolerationSeconds' + is set + rule: 'self.all(x, has(x.tolerationSeconds) ? x.effect + == ''NoExecute'' : true)' + - message: 'supported toleration values: ''Equal''(default), + ''Exists''' + rule: self.all(x, !has(x.operator) || x.operator in + ['Equal', 'Exists']) + - message: a value must be empty when 'operator' is 'Exists' + rule: 'self.all(x, has(x.operator) && x.operator == + ''Exists'' ? !has(x.value) : true)' + - message: 'supported taint effect values: ''NoSchedule'', + ''PreferNoSchedule'', ''NoExecute''' + rule: self.all(x, !has(x.effect) || x.effect in ['NoSchedule', + 'PreferNoSchedule', 'NoExecute']) + required: + - name + type: object + maxItems: 8 + type: array + x-kubernetes-list-type: atomic + state: + description: state of the admissionCheck, one of Pending, Ready, + Retry, Rejected + enum: + - Pending + - Ready + - Retry + - Rejected + type: string + required: + - lastTransitionTime + - message + - name + - state + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + conditions: + description: |- + conditions hold the latest available observations of the Workload + current state. + + The type of the condition could be: + + - Admitted: the Workload was admitted through a ClusterQueue. + - Finished: the associated workload finished running (failed or succeeded). + - PodsReady: at least `.spec.podSets[*].count` Pods are ready or have + succeeded. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + reclaimablePods: + description: |- + reclaimablePods keeps track of the number pods within a podset for which + the resource reservation is no longer needed. + items: + properties: + count: + description: count is the number of pods for which the requested + resources are no longer needed. + format: int32 + minimum: 0 + type: integer + name: + description: name is the PodSet name. + type: string + required: + - count + - name + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + requeueState: + description: |- + requeueState holds the re-queue state + when a workload meets Eviction with PodsReadyTimeout reason. + properties: + count: + description: |- + count records the number of times a workload has been re-queued + When a deactivated (`.spec.activate`=`false`) workload is reactivated (`.spec.activate`=`true`), + this count would be reset to null. + format: int32 + minimum: 0 + type: integer + requeueAt: + description: |- + requeueAt records the time when a workload will be re-queued. + When a deactivated (`.spec.activate`=`false`) workload is reactivated (`.spec.activate`=`true`), + this time would be reset to null. + format: date-time + type: string + type: object + resourceRequests: + description: |- + resourceRequests provides a detailed view of the resources that were + requested by a non-admitted workload when it was considered for admission. + If admission is non-null, resourceRequests will be empty because + admission.resourceUsage contains the detailed information. + items: + properties: + name: + default: main + description: name is the name of the podSet. It should match + one of the names in .spec.podSets. + maxLength: 63 + pattern: ^(?i)[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + resources is the total resources all the pods in the podset need to run. + + Beside what is provided in podSet's specs, this value also takes into account + the LimitRange defaults and RuntimeClass overheads at the moment of consideration + and the application of resource.excludeResourcePrefixes and resource.transformations. + type: object + required: + - name + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + type: object + x-kubernetes-validations: + - message: podSetAssignments must have the same number of podSets as the spec + rule: 'has(self.status) && has(self.status.conditions) && self.status.conditions.exists(c, + c.type == ''QuotaReserved'' && c.status == ''True'') && has(self.status.admission) + ? size(self.spec.podSets) == size(self.status.admission.podSetAssignments) + : true' + - message: field is immutable + rule: '(has(oldSelf.status) && has(oldSelf.status.conditions) && oldSelf.status.conditions.exists(c, + c.type == ''QuotaReserved'' && c.status == ''True'')) ? 
(oldSelf.spec.priorityClassSource + == self.spec.priorityClassSource) : true' + - message: field is immutable + rule: '(has(oldSelf.status) && has(oldSelf.status.conditions) && oldSelf.status.conditions.exists(c, + c.type == ''QuotaReserved'' && c.status == ''True'') && has(oldSelf.spec.priorityClassName) + && has(self.spec.priorityClassName)) ? (oldSelf.spec.priorityClassName + == self.spec.priorityClassName) : true' + - message: field is immutable + rule: '(has(oldSelf.status) && has(oldSelf.status.conditions) && oldSelf.status.conditions.exists(c, + c.type == ''QuotaReserved'' && c.status == ''True'')) && (has(self.status) + && has(self.status.conditions) && self.status.conditions.exists(c, c.type + == ''QuotaReserved'' && c.status == ''True'')) && has(oldSelf.spec.queueName) + && has(self.spec.queueName) ? oldSelf.spec.queueName == self.spec.queueName + : true' + - message: maximumExecutionTimeSeconds is immutable while admitted + rule: ((has(oldSelf.status) && has(oldSelf.status.conditions) && oldSelf.status.conditions.exists(c, + c.type == 'Admitted' && c.status == 'True')) && (has(self.status) && has(self.status.conditions) + && self.status.conditions.exists(c, c.type == 'Admitted' && c.status == + 'True')))?((has(oldSelf.spec.maximumExecutionTimeSeconds)?oldSelf.spec.maximumExecutionTimeSeconds:0) + == (has(self.spec.maximumExecutionTimeSeconds)?self.spec.maximumExecutionTimeSeconds:0)):true + served: true + storage: true + subresources: + status: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-controller-manager + namespace: kueue-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-leader-election-role + namespace: kueue-system +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +--- +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.kueue.x-k8s.io/batch-admin: "true" +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-batch-admin-role +--- +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.kueue.x-k8s.io/batch-user: "true" +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-batch-user-role +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + name: kueue-clusterqueue-editor-role +rules: +- apiGroups: + - kueue.x-k8s.io + resources: + - clusterqueues + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - clusterqueues/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: 
ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + name: kueue-clusterqueue-viewer-role +rules: +- apiGroups: + - kueue.x-k8s.io + resources: + - clusterqueues + verbs: + - get + - list + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - clusterqueues/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-job-editor-role +rules: +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - batch + resources: + - jobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-job-viewer-role +rules: +- apiGroups: + - batch + resources: + - jobs + verbs: + - get + - list + - watch +- apiGroups: + - batch + resources: + - jobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-jobset-editor-role +rules: +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-jobset-viewer-role +rules: +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets + verbs: + - get + - list + - watch +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + name: kueue-localqueue-editor-role +rules: +- apiGroups: + - kueue.x-k8s.io + resources: + - localqueues + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - localqueues/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-localqueue-viewer-role +rules: +- apiGroups: + - kueue.x-k8s.io + resources: + - localqueues + verbs: + - get + - list + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - localqueues/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + 
app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-manager-role +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update + - watch +- apiGroups: + - "" + resources: + - limitranges + - namespaces + - nodes + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods/finalizers + verbs: + - get + - update +- apiGroups: + - "" + resources: + - pods/status + verbs: + - get + - patch +- apiGroups: + - "" + resources: + - podtemplates + verbs: + - create + - delete + - get + - list + - update + - watch +- apiGroups: + - "" + resources: + - secrets + verbs: + - get + - list + - update + - watch +- apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - get + - list + - update + - watch +- apiGroups: + - admissionregistration.k8s.io + resources: + - validatingadmissionpolicies + - validatingadmissionpolicybindings + verbs: + - get + - list + - watch +- apiGroups: + - apps + resources: + - replicasets + - statefulsets + verbs: + - get + - list + - watch +- apiGroups: + - autoscaling.x-k8s.io + resources: + - provisioningrequests + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - autoscaling.x-k8s.io + resources: + - provisioningrequests/status + verbs: + - get +- apiGroups: + - batch + resources: + - jobs + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - batch + resources: + - jobs/finalizers + - jobs/status + verbs: + - get + - patch + - update +- apiGroups: + - flowcontrol.apiserver.k8s.io + resources: + - flowschemas + - prioritylevelconfigurations + verbs: + - list + - watch +- apiGroups: + - flowcontrol.apiserver.k8s.io + resources: + - flowschemas/status + verbs: + - patch +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets/finalizers + verbs: + - get + - update +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets/status + verbs: + - get + - patch + - update +- apiGroups: + - kubeflow.org + resources: + - mpijobs + - mxjobs + - paddlejobs + - pytorchjobs + - tfjobs + - xgboostjobs + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - kubeflow.org + resources: + - mpijobs/finalizers + - mxjobs/finalizers + - mxjobs/status + - paddlejobs/finalizers + - pytorchjobs/finalizers + - tfjobs/finalizers + - xgboostjobs/finalizers + verbs: + - get + - update +- apiGroups: + - kubeflow.org + resources: + - mpijobs/status + - paddlejobs/status + - pytorchjobs/status + - tfjobs/status + - xgboostjobs/status + verbs: + - get + - patch + - update +- apiGroups: + - kueue.x-k8s.io + resources: + - admissionchecks + - clusterqueues + - cohorts + - localqueues + - workloads + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - admissionchecks/finalizers + - clusterqueues/finalizers + - localqueues/finalizers + - resourceflavors/finalizers + - topology/finalizers + - workloads/finalizers + verbs: + - update +- apiGroups: + - kueue.x-k8s.io + resources: + - admissionchecks/status + - clusterqueues/status + - localqueues/status + - multikueueclusters/status + - workloads/status + verbs: + - get + - patch + - 
update +- apiGroups: + - kueue.x-k8s.io + resources: + - multikueueclusters + - multikueueconfigs + - provisioningrequestconfigs + - topologies + - workloadpriorityclasses + verbs: + - get + - list + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - resourceflavors + verbs: + - delete + - get + - list + - update + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - topology + verbs: + - get + - list + - update + - watch +- apiGroups: + - node.k8s.io + resources: + - runtimeclasses + verbs: + - get + - list + - watch +- apiGroups: + - ray.io + resources: + - rayclusters + - rayjobs + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - ray.io + resources: + - rayclusters/finalizers + - rayclusters/status + - rayjobs/finalizers + - rayjobs/status + verbs: + - get + - update +- apiGroups: + - scheduling.k8s.io + resources: + - priorityclasses + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-metrics-reader +rules: +- nonResourceURLs: + - /metrics + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-mpijob-editor-role +rules: +- apiGroups: + - kubeflow.org + resources: + - mpijobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kubeflow.org + resources: + - mpijobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-mpijob-viewer-role +rules: +- apiGroups: + - kubeflow.org + resources: + - mpijobs + verbs: + - get + - list + - watch +- apiGroups: + - kubeflow.org + resources: + - mpijobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-mxjob-editor-role +rules: +- apiGroups: + - kubeflow.org + resources: + - mxjobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kubeflow.org + resources: + - mxjobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-mxjob-viewer-role +rules: +- apiGroups: + - kubeflow.org + resources: + - mxjobs + verbs: + - get + - list + - watch +- apiGroups: + - kubeflow.org + resources: + - mxjobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: 
kueue-paddlejob-editor-role +rules: +- apiGroups: + - kubeflow.org + resources: + - paddlejobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kubeflow.org + resources: + - paddlejobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-paddlejob-viewer-role +rules: +- apiGroups: + - kubeflow.org + resources: + - paddlejobs + verbs: + - get + - list + - watch +- apiGroups: + - kubeflow.org + resources: + - paddlejobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + name: kueue-pending-workloads-cq-viewer-role +rules: +- apiGroups: + - visibility.kueue.x-k8s.io + resources: + - clusterqueues/pendingworkloads + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-pending-workloads-lq-viewer-role +rules: +- apiGroups: + - visibility.kueue.x-k8s.io + resources: + - localqueues/pendingworkloads + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-proxy-role +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-pytorchjob-editor-role +rules: +- apiGroups: + - kubeflow.org + resources: + - pytorchjobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kubeflow.org + resources: + - pytorchjobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-pytorchjob-viewer-role +rules: +- apiGroups: + - kubeflow.org + resources: + - pytorchjobs + verbs: + - get + - list + - watch +- apiGroups: + - kubeflow.org + resources: + - pytorchjobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-raycluster-editor-role +rules: +- apiGroups: + - ray.io + resources: + - rayclusters + verbs: + - create + - delete + - get + - list + - patch + - 
update + - watch +- apiGroups: + - ray.io + resources: + - rayclusters/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + name: kueue-raycluster-viewer-role +rules: +- apiGroups: + - ray.io + resources: + - rayclusters + verbs: + - get + - list + - watch +- apiGroups: + - ray.io + resources: + - rayclusters/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-rayjob-editor-role +rules: +- apiGroups: + - ray.io + resources: + - rayjobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - ray.io + resources: + - rayjobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-rayjob-viewer-role +rules: +- apiGroups: + - ray.io + resources: + - rayjobs + verbs: + - get + - list + - watch +- apiGroups: + - ray.io + resources: + - rayjobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + name: kueue-resourceflavor-editor-role +rules: +- apiGroups: + - kueue.x-k8s.io + resources: + - resourceflavors + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + name: kueue-resourceflavor-viewer-role +rules: +- apiGroups: + - kueue.x-k8s.io + resources: + - resourceflavors + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-tfjob-editor-role +rules: +- apiGroups: + - kubeflow.org + resources: + - tfjobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kubeflow.org + resources: + - tfjobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-tfjob-viewer-role +rules: +- apiGroups: + - kubeflow.org + resources: + - tfjobs + verbs: + - get + - list + - watch +- apiGroups: + - kubeflow.org + resources: + - tfjobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: 
kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + name: kueue-workload-editor-role +rules: +- apiGroups: + - kueue.x-k8s.io + resources: + - workloads + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - workloads/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-workload-viewer-role +rules: +- apiGroups: + - kueue.x-k8s.io + resources: + - workloads + verbs: + - get + - list + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - workloads/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-xgboostjob-editor-role +rules: +- apiGroups: + - kubeflow.org + resources: + - xgboostjobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kubeflow.org + resources: + - xgboostjobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + rbac.kueue.x-k8s.io/batch-admin: "true" + rbac.kueue.x-k8s.io/batch-user: "true" + name: kueue-xgboostjob-viewer-role +rules: +- apiGroups: + - kubeflow.org + resources: + - xgboostjobs + verbs: + - get + - list + - watch +- apiGroups: + - kubeflow.org + resources: + - xgboostjobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-visibility-server-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: kueue-controller-manager + namespace: kueue-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-leader-election-rolebinding + namespace: kueue-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kueue-leader-election-role +subjects: +- kind: ServiceAccount + name: kueue-controller-manager + namespace: kueue-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-manager-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kueue-manager-role +subjects: +- kind: ServiceAccount + name: kueue-controller-manager + namespace: kueue-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-proxy-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: 
ClusterRole + name: kueue-proxy-role +subjects: +- kind: ServiceAccount + name: kueue-controller-manager + namespace: kueue-system +--- +apiVersion: v1 +data: + controller_manager_config.yaml: | + apiVersion: config.kueue.x-k8s.io/v1beta1 + kind: Configuration + health: + healthProbeBindAddress: :8081 + metrics: + bindAddress: :8080 + # enableClusterQueueResources: true + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io + controller: + groupKindConcurrency: + Job.batch: 5 + Pod: 5 + Workload.kueue.x-k8s.io: 5 + LocalQueue.kueue.x-k8s.io: 1 + Cohort.kueue.x-k8s.io: 1 + ClusterQueue.kueue.x-k8s.io: 1 + ResourceFlavor.kueue.x-k8s.io: 1 + clientConnection: + qps: 50 + burst: 100 + #pprofBindAddress: :8083 + #waitForPodsReady: + # enable: false + # timeout: 5m + # blockAdmission: false + # requeuingStrategy: + # timestamp: Eviction + # backoffLimitCount: null # null indicates infinite requeuing + # backoffBaseSeconds: 60 + # backoffMaxSeconds: 3600 + #manageJobsWithoutQueueName: true + #managedJobsNamespaceSelector: + # matchLabels: + # kueue-managed: "true" + #internalCertManagement: + # enable: false + # webhookServiceName: "" + # webhookSecretName: "" + integrations: + frameworks: + - "batch/job" + - "kubeflow.org/mpijob" + - "ray.io/rayjob" + - "ray.io/raycluster" + - "jobset.x-k8s.io/jobset" + - "kubeflow.org/mxjob" + - "kubeflow.org/paddlejob" + - "kubeflow.org/pytorchjob" + - "kubeflow.org/tfjob" + - "kubeflow.org/xgboostjob" + # - "pod" + # - "deployment" # requires enabling pod integration + # - "statefulset" # requires enabling pod integration + # externalFrameworks: + # - "Foo.v1.example.com" + # podOptions: + # namespaceSelector: + # matchExpressions: + # - key: kubernetes.io/metadata.name + # operator: NotIn + # values: [ kube-system, kueue-system ] + #fairSharing: + # enable: true + # preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] + #resources: + # excludeResourcePrefixes: [] + # transformations: + # - input: nvidia.com/mig-4g.5gb + # strategy: Replace | Retain + # outputs: + # example.com/accelerator-memory: 5Gi + # example.com/accelerator-gpc: 4 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-manager-config + namespace: kueue-system +--- +apiVersion: v1 +kind: Secret +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-webhook-server-cert + namespace: kueue-system +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-controller-manager-metrics-service + namespace: kueue-system +spec: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: https + selector: + control-plane: controller-manager +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-visibility-server + namespace: kueue-system +spec: + ports: + - name: https + port: 443 + protocol: TCP + targetPort: 8082 + selector: + control-plane: controller-manager +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-webhook-service + namespace: kueue-system +spec: + 
ports: + - port: 443 + protocol: TCP + targetPort: 9443 + selector: + control-plane: controller-manager +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-controller-manager + namespace: kueue-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + spec: + containers: + - args: + - --config=/controller_manager_config.yaml + - --zap-log-level=2 + - --feature-gates=TopologyAwareScheduling=true + command: + - /manager + image: registry.k8s.io/kueue/kueue:v0.10.0 + imagePullPolicy: Always + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 8082 + name: visibility + protocol: TCP + - containerPort: 9443 + name: webhook-server + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + securityContext: + allowPrivilegeEscalation: false + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + - args: + - --secure-listen-address=0.0.0.0:8443 + - --upstream=http://127.0.0.1:8080/ + - --logtostderr=true + - --v=10 + image: registry.k8s.io/kubebuilder/kube-rbac-proxy:v0.16.0 + name: kube-rbac-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + securityContext: + runAsNonRoot: true + serviceAccountName: kueue-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: kueue-webhook-server-cert + - configMap: + name: kueue-manager-config + name: manager-config + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" +--- +apiVersion: apiregistration.k8s.io/v1 +kind: APIService +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: v1beta1.visibility.kueue.x-k8s.io +spec: + group: visibility.kueue.x-k8s.io + groupPriorityMinimum: 100 + insecureSkipTLSVerify: true + service: + name: kueue-visibility-server + namespace: kueue-system + version: v1beta1 + versionPriority: 100 +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-mutating-webhook-configuration +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate--v1-pod + failurePolicy: Fail + name: mpod.kb.io + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: NotIn + values: + - kube-system + - kueue-system + rules: + - apiGroups: + - "" + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: 
/mutate-apps-v1-deployment + failurePolicy: Fail + name: mdeployment.kb.io + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: NotIn + values: + - kube-system + - kueue-system + rules: + - apiGroups: + - apps + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - deployments + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-batch-v1-job + failurePolicy: Fail + name: mjob.kb.io + rules: + - apiGroups: + - batch + apiVersions: + - v1 + operations: + - CREATE + resources: + - jobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-jobset-x-k8s-io-v1alpha2-jobset + failurePolicy: Fail + name: mjobset.kb.io + rules: + - apiGroups: + - jobset.x-k8s.io + apiVersions: + - v1alpha2 + operations: + - CREATE + resources: + - jobsets + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-kubeflow-org-v1-mxjob + failurePolicy: Fail + name: mmxjob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1 + operations: + - CREATE + resources: + - mxjobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-kubeflow-org-v1-paddlejob + failurePolicy: Fail + name: mpaddlejob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1 + operations: + - CREATE + resources: + - paddlejobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-kubeflow-org-v1-pytorchjob + failurePolicy: Fail + name: mpytorchjob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1 + operations: + - CREATE + resources: + - pytorchjobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-kubeflow-org-v1-tfjob + failurePolicy: Fail + name: mtfjob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1 + operations: + - CREATE + resources: + - tfjobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-kubeflow-org-v1-xgboostjob + failurePolicy: Fail + name: mxgboostjob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1 + operations: + - CREATE + resources: + - xgboostjobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-kubeflow-org-v2beta1-mpijob + failurePolicy: Fail + name: mmpijob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v2beta1 + operations: + - CREATE + resources: + - mpijobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-ray-io-v1-raycluster + failurePolicy: Fail + name: mraycluster.kb.io + rules: + - apiGroups: + - ray.io + apiVersions: + - v1 + operations: + - CREATE + resources: + - rayclusters + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-ray-io-v1-rayjob + 
failurePolicy: Fail + name: mrayjob.kb.io + rules: + - apiGroups: + - ray.io + apiVersions: + - v1 + operations: + - CREATE + resources: + - rayjobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-apps-v1-statefulset + failurePolicy: Fail + name: mstatefulset.kb.io + rules: + - apiGroups: + - apps + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - statefulsets + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-kueue-x-k8s-io-v1beta1-clusterqueue + failurePolicy: Fail + name: mclusterqueue.kb.io + rules: + - apiGroups: + - kueue.x-k8s.io + apiVersions: + - v1beta1 + operations: + - CREATE + resources: + - clusterqueues + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-kueue-x-k8s-io-v1beta1-resourceflavor + failurePolicy: Fail + name: mresourceflavor.kb.io + rules: + - apiGroups: + - kueue.x-k8s.io + apiVersions: + - v1beta1 + operations: + - CREATE + resources: + - resourceflavors + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /mutate-kueue-x-k8s-io-v1beta1-workload + failurePolicy: Fail + name: mworkload.kb.io + rules: + - apiGroups: + - kueue.x-k8s.io + apiVersions: + - v1beta1 + operations: + - CREATE + resources: + - workloads + sideEffects: None +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-validating-webhook-configuration +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate--v1-pod + failurePolicy: Fail + name: vpod.kb.io + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: NotIn + values: + - kube-system + - kueue-system + rules: + - apiGroups: + - "" + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - pods + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-apps-v1-deployment + failurePolicy: Fail + name: vdeployment.kb.io + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: NotIn + values: + - kube-system + - kueue-system + rules: + - apiGroups: + - apps + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - deployments + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-batch-v1-job + failurePolicy: Fail + name: vjob.kb.io + rules: + - apiGroups: + - batch + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - jobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-jobset-x-k8s-io-v1alpha2-jobset + failurePolicy: Fail + name: vjobset.kb.io + rules: + - apiGroups: + - jobset.x-k8s.io + apiVersions: + - v1alpha2 + operations: + - CREATE + - UPDATE + resources: + - jobsets + sideEffects: None +- admissionReviewVersions: + - v1 + 
clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-kubeflow-org-v1-mxjob + failurePolicy: Fail + name: vmxjob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - mxjobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-kubeflow-org-v1-paddlejob + failurePolicy: Fail + name: vpaddlejob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - paddlejobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-kubeflow-org-v1-pytorchjob + failurePolicy: Fail + name: vpytorchjob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - pytorchjobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-kubeflow-org-v1-tfjob + failurePolicy: Fail + name: vtfjob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - tfjobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-kubeflow-org-v1-xgboostjob + failurePolicy: Fail + name: vxgboostjob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - xgboostjobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-kubeflow-org-v2beta1-mpijob + failurePolicy: Fail + name: vmpijob.kb.io + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v2beta1 + operations: + - CREATE + - UPDATE + resources: + - mpijobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-ray-io-v1-raycluster + failurePolicy: Fail + name: vraycluster.kb.io + rules: + - apiGroups: + - ray.io + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - rayclusters + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-ray-io-v1-rayjob + failurePolicy: Fail + name: vrayjob.kb.io + rules: + - apiGroups: + - ray.io + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - rayjobs + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-apps-v1-statefulset + failurePolicy: Fail + name: vstatefulset.kb.io + rules: + - apiGroups: + - apps + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - statefulsets + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-kueue-x-k8s-io-v1beta1-clusterqueue + failurePolicy: Fail + name: vclusterqueue.kb.io + rules: + - apiGroups: + - kueue.x-k8s.io + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - clusterqueues + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + 
name: kueue-webhook-service + namespace: kueue-system + path: /validate-kueue-x-k8s-io-v1alpha1-cohort + failurePolicy: Fail + name: vcohort.kb.io + rules: + - apiGroups: + - kueue.x-k8s.io + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - cohorts + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-kueue-x-k8s-io-v1beta1-resourceflavor + failurePolicy: Fail + name: vresourceflavor.kb.io + rules: + - apiGroups: + - kueue.x-k8s.io + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - resourceflavors + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: kueue-webhook-service + namespace: kueue-system + path: /validate-kueue-x-k8s-io-v1beta1-workload + failurePolicy: Fail + name: vworkload.kb.io + rules: + - apiGroups: + - kueue.x-k8s.io + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - workloads + - workloads/status + sideEffects: None diff --git a/modules/management/kubectl-apply/variables.tf b/modules/management/kubectl-apply/variables.tf index c493332e7c..6c65342e65 100644 --- a/modules/management/kubectl-apply/variables.tf +++ b/modules/management/kubectl-apply/variables.tf @@ -15,7 +15,7 @@ */ locals { - kueue_supported_versions = ["v0.9.1", "v0.9.0", "v0.8.1"] + kueue_supported_versions = ["v0.10.0", "v0.9.1", "v0.9.0", "v0.8.1"] jobset_supported_versions = ["v0.7.1", "v0.5.2"] } diff --git a/tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml b/tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml index 726768ffa8..a09e180732 100644 --- a/tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml +++ b/tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml @@ -93,6 +93,6 @@ deployment_groups: settings: kueue: install: true - version: v0.9.0 + version: v0.10.0 jobset: install: true From 539711e463759d760a32f6c51d37b7ba5f0f2c91 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 17 Dec 2024 20:27:08 +0000 Subject: [PATCH 034/140] Fix clean_up placement policy bug --- .../modules/slurm_files/scripts/slurmsync.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 21d9324e79..b06d093e78 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -303,13 +303,11 @@ def _seconds_since_timestamp(timestamp): def delete_placement_groups(placement_groups): - def delete_placement_request(pg_name, region): - return lookup().compute.resourcePolicies().delete( - project=lookup().project, region=region, resourcePolicy=pg_name - ) - requests = { - pg.name: delete_placement_request(pg["name"], util.trim_self_link(pg["region"])) + pg["name"]: lookup().compute.resourcePolicies().delete( + project=lookup().project, + region=util.trim_self_link(pg["region"]), + resourcePolicy=pg["name"]) for pg in placement_groups } From e675d5541e94838cc9825bcbb50397d70c171b32 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Tue, 17 Dec 2024 23:00:39 +0000 Subject: [PATCH 035/140] Update validation script to skip the new a3u blueprint --- 
tools/validate_configs/validate_configs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/validate_configs/validate_configs.sh b/tools/validate_configs/validate_configs.sh index 996b25006f..a414f2eb33 100755 --- a/tools/validate_configs/validate_configs.sh +++ b/tools/validate_configs/validate_configs.sh @@ -120,7 +120,7 @@ check_background() { fi } -CONFIGS=$(find examples/ community/examples/ tools/validate_configs/test_configs/ docs/tutorials/ docs/videos/build-your-own-blueprint/ -name "*.yaml" -type f -not -path 'examples/machine-learning/a3-megagpu-8g/*') +CONFIGS=$(find examples/ community/examples/ tools/validate_configs/test_configs/ docs/tutorials/ docs/videos/build-your-own-blueprint/ -name "*.yaml" -type f -not -path 'examples/machine-learning/a3-megagpu-8g/*' -not -path 'examples/gke-a3-ultragpu/*') cwd=$(pwd) NPROCS=${NPROCS:-$(nproc)} echo "Running tests in $NPROCS processes" From bb3640291926ef9de917a716f7b437e2765a407a Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Wed, 18 Dec 2024 00:18:26 +0000 Subject: [PATCH 036/140] resolve linter errors --- examples/gke-a3-ultragpu/README.md | 2 +- .../gke-a3-ultragpu-deployment.yaml | 42 ++++++++++++------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/examples/gke-a3-ultragpu/README.md b/examples/gke-a3-ultragpu/README.md index 73b37bbfcb..a7831af889 100644 --- a/examples/gke-a3-ultragpu/README.md +++ b/examples/gke-a3-ultragpu/README.md @@ -1 +1 @@ -Refer to [AI Hypercomputer Documentation](https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute#create-cluster) for instructions. \ No newline at end of file +Refer to [AI Hypercomputer Documentation](https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute#create-cluster) for instructions. diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml index b7a8d24071..0e475ec2d6 100644 --- a/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml @@ -1,16 +1,30 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ --- - terraform_backend_defaults: - type: gcs - configuration: - bucket: BUCKET_NAME +terraform_backend_defaults: + type: gcs + configuration: + bucket: BUCKET_NAME - vars: - deployment_name: gke-a3-ultra - project_id: PROJECT_ID - region: COMPUTE_REGION - zone: COMPUTE_ZONE - authorized_cidr: / - # In order to not target a BLOCK_NAME, extended_reservation can be inputed as - # extended_reservation: RESERVATION_NAME - extended_reservation: RESERVATION_NAME/reservationBlocks/BLOCK_NAME - static_node_count: NODE_COUNT +vars: + deployment_name: gke-a3-ultra + project_id: PROJECT_ID + region: COMPUTE_REGION + zone: COMPUTE_ZONE + authorized_cidr: / + # In order to not target a BLOCK_NAME, extended_reservation can be inputted as + # extended_reservation: RESERVATION_NAME + extended_reservation: RESERVATION_NAME/reservationBlocks/BLOCK_NAME + static_node_count: NODE_COUNT From 9f505af82e0bf84ed069edef7589b4fc1ad3f169 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 18 Dec 2024 11:57:40 +0000 Subject: [PATCH 037/140] Update terraform provider from google-beta to beta for parallelstore --- modules/file-system/parallelstore/README.md | 6 +++--- modules/file-system/parallelstore/main.tf | 1 - modules/file-system/parallelstore/versions.tf | 6 +++--- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md index 46f0969b93..745affcc64 100644 --- a/modules/file-system/parallelstore/README.md +++ b/modules/file-system/parallelstore/README.md @@ -114,7 +114,7 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 0.13 | -| [google-beta](#requirement\_google-beta) | >= 5.25.0 | +| [google](#requirement\_google) | >= 6.13.0 | | [null](#requirement\_null) | ~> 3.0 | | [random](#requirement\_random) | ~> 3.0 | @@ -122,7 +122,7 @@ limitations under the License. | Name | Version | |------|---------| -| [google-beta](#provider\_google-beta) | >= 5.25.0 | +| [google](#provider\_google) | >= 6.13.0 | | [null](#provider\_null) | ~> 3.0 | | [random](#provider\_random) | ~> 3.0 | @@ -134,7 +134,7 @@ No modules. 
| Name | Type | |------|------| -| [google-beta_google_parallelstore_instance.instance](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_parallelstore_instance) | resource | +| [google_parallelstore_instance.instance](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/parallelstore_instance) | resource | | [null_resource.hydration](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [random_id.resource_name_suffix](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource | diff --git a/modules/file-system/parallelstore/main.tf b/modules/file-system/parallelstore/main.tf index 3de3b94f3a..acf3b07beb 100644 --- a/modules/file-system/parallelstore/main.tf +++ b/modules/file-system/parallelstore/main.tf @@ -54,7 +54,6 @@ resource "google_parallelstore_instance" "instance" { labels = local.labels - provider = google-beta depends_on = [var.private_vpc_connection_peering] } diff --git a/modules/file-system/parallelstore/versions.tf b/modules/file-system/parallelstore/versions.tf index 24069a479c..174b5281e4 100644 --- a/modules/file-system/parallelstore/versions.tf +++ b/modules/file-system/parallelstore/versions.tf @@ -18,9 +18,9 @@ terraform { required_version = ">= 0.13" required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = ">= 5.25.0" + google = { + source = "hashicorp/google" + version = ">= 6.13.0" } random = { From 7d8061d5607531eb4f2b80ad76c24a5fb4de490c Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Tue, 17 Dec 2024 01:07:06 +0000 Subject: [PATCH 038/140] Adding integration test for ansible os coverage --- .../startup-script/files/install_ansible.sh | 5 +- .../test-validation/test-ansible-vm.yml | 24 ++++ .../daily-tests/blueprints/ansible-vm.yaml | 115 ++++++++++++++++++ .../daily-tests/builds/ansible-vm.yaml | 41 +++++++ .../daily-tests/tests/ansible-vm.yml | 25 ++++ 5 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ansible-vm.yml create mode 100644 tools/cloud-build/daily-tests/blueprints/ansible-vm.yaml create mode 100644 tools/cloud-build/daily-tests/builds/ansible-vm.yaml create mode 100644 tools/cloud-build/daily-tests/tests/ansible-vm.yml diff --git a/modules/scripts/startup-script/files/install_ansible.sh b/modules/scripts/startup-script/files/install_ansible.sh index 41c483307f..7146ecea36 100644 --- a/modules/scripts/startup-script/files/install_ansible.sh +++ b/modules/scripts/startup-script/files/install_ansible.sh @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+set -e REQ_ANSIBLE_VERSION=2.11 REQ_ANSIBLE_PIP_VERSION=4.10.0 REQ_PIP_WHEEL_VERSION=0.37.1 @@ -197,13 +198,13 @@ main() { fi # upgrade wheel if necessary - wheel_pkg=$(${venv_python_path} -m pip list --format=freeze | grep "^wheel") + wheel_pkg=$(${venv_python_path} -m pip list --format=freeze | grep "^wheel" || true) if [ "$wheel_pkg" != "wheel==${REQ_PIP_WHEEL_VERSION}" ]; then ${venv_python_path} -m pip install -U wheel==${REQ_PIP_WHEEL_VERSION} fi # upgrade setuptools if necessary - setuptools_pkg=$(${venv_python_path} -m pip list --format=freeze | grep "^setuptools") + setuptools_pkg=$(${venv_python_path} -m pip list --format=freeze | grep "^setuptools" || true) if [ "$setuptools_pkg" != "setuptools==${REQ_PIP_SETUPTOOLS_VERSION}" ]; then ${venv_python_path} -m pip install -U setuptools==${REQ_PIP_SETUPTOOLS_VERSION} fi diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ansible-vm.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ansible-vm.yml new file mode 100644 index 0000000000..da73958dd5 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ansible-vm.yml @@ -0,0 +1,24 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +- name: Check if Ansible is installed + ansible.builtin.shell: | + command -v ansible >/dev/null 2>&1 && echo "Ansible is installed" || echo "Ansible is not installed" + register: ansible_check_result + +- name: Assert Ansible is installed + ansible.builtin.assert: + that: + - ansible_check_result.stdout == "Ansible is installed" diff --git a/tools/cloud-build/daily-tests/blueprints/ansible-vm.yaml b/tools/cloud-build/daily-tests/blueprints/ansible-vm.yaml new file mode 100644 index 0000000000..2b5ac93131 --- /dev/null +++ b/tools/cloud-build/daily-tests/blueprints/ansible-vm.yaml @@ -0,0 +1,115 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- + +blueprint_name: test-workstation-ansible + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: test-workstation-ansible + region: us-central1 + zone: us-central1-a + machine_type: n2-standard-2 + disk_type: pd-ssd + instance_count: 1 + +deployment_groups: +- group: primary + modules: + + - id: network1 + source: modules/network/pre-existing-vpc + + - id: startup-script + source: modules/scripts/startup-script + settings: + install_ansible: true + runners: + - type: shell + destination: startup.sh + content: | + #!/bin/bash + set -ex + echo \$(ansible --version) + + - id: workstation-centos + source: modules/compute/vm-instance + use: + - network1 + - startup-script + settings: + name_prefix: centos + add_deployment_name_before_prefix: true + instance_image: + name: centos-7-v20240611 + project: centos-cloud + + - id: workstation-ubuntu-2004 + source: modules/compute/vm-instance + use: + - network1 + - startup-script + settings: + name_prefix: ubuntu2004 + add_deployment_name_before_prefix: true + instance_image: + family: ubuntu-2004-lts + project: ubuntu-os-cloud + + - id: workstation-ubuntu-2204 + source: modules/compute/vm-instance + use: + - network1 + - startup-script + settings: + name_prefix: ubuntu2204 + add_deployment_name_before_prefix: true + instance_image: + family: ubuntu-2204-lts + project: ubuntu-os-cloud + + - id: workstation-debian + source: modules/compute/vm-instance + use: + - network1 + - startup-script + settings: + name_prefix: debian + instance_image: + family: debian-11 + project: debian-cloud + + - id: workstation-rocky-8 + source: modules/compute/vm-instance + use: + - network1 + - startup-script + settings: + name_prefix: rocky8 + add_deployment_name_before_prefix: true + instance_image: + family: rocky-linux-8-optimized-gcp + project: rocky-linux-cloud + + - id: wait-for-startup + source: community/modules/scripts/wait-for-startup + settings: + instance_names: + - $(workstation-centos.name[0]) + - $(workstation-ubuntu-2004.name[0]) + - $(workstation-ubuntu-2204.name[0]) + - $(workstation-debian.name[0]) + - $(workstation-rocky-8.name[0]) + timeout: 7200 diff --git a/tools/cloud-build/daily-tests/builds/ansible-vm.yaml b/tools/cloud-build/daily-tests/builds/ansible-vm.yaml new file mode 100644 index 0000000000..a3aba07522 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/ansible-vm.yaml @@ -0,0 +1,41 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +tags: +- m.pre-existing-vpc +- m.startup-script +- m.vm-instance +- m.wait-for-startup +- vm + +timeout: 14400s # 4hr +steps: +- id: anisble-vm + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/ansible-vm.yml" diff --git a/tools/cloud-build/daily-tests/tests/ansible-vm.yml b/tools/cloud-build/daily-tests/tests/ansible-vm.yml new file mode 100644 index 0000000000..39f773d544 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/ansible-vm.yml @@ -0,0 +1,25 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +test_name: test-workstation-ansible +deployment_name: "ansible-vm-{{ build }}" +zone: us-central1-a +workspace: /workspace +blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/ansible-vm.yaml" +network: "default" +remote_node: "{{ deployment_name }}-centos-0" +post_deploy_tests: +- test-validation/test-ansible-vm.yml From 2479b7f7bb42879b12ef42be51278ed170eb6e0d Mon Sep 17 00:00:00 2001 From: ighosh98 Date: Wed, 18 Dec 2024 06:18:04 +0000 Subject: [PATCH 039/140] add reservations for kueue integration tests --- .../daily-tests/blueprints/gke-a2-highgpu.yaml | 3 ++- .../daily-tests/tests/gke-a2-highgpu-kueue.yml | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml b/tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml index a09e180732..4c77cf0bcb 100644 --- a/tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml +++ b/tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml @@ -20,7 +20,7 @@ vars: project_id: hpc-toolkit-dev ## Set GCP Project ID Here ## deployment_name: gke-a2-highgpu region: us-central1 - zone: us-central1-c + zone: us-central1-f # Cidr block containing the IP of the machine calling terraform. # The following line must be updated for this example to work. 
@@ -84,6 +84,7 @@ deployment_groups: zones: [$(vars.zone)] image_type: UBUNTU_CONTAINERD placement_policy: + name: a2-highgpu-compact type: "COMPACT" outputs: [instructions] diff --git a/tools/cloud-build/daily-tests/tests/gke-a2-highgpu-kueue.yml b/tools/cloud-build/daily-tests/tests/gke-a2-highgpu-kueue.yml index 549fbac367..0735f4f970 100644 --- a/tools/cloud-build/daily-tests/tests/gke-a2-highgpu-kueue.yml +++ b/tools/cloud-build/daily-tests/tests/gke-a2-highgpu-kueue.yml @@ -22,13 +22,19 @@ workspace: /workspace blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/gke-a2-highgpu.yaml" network: "gke-a2high-net-{{ build }}" region: us-central1 -zone: us-central1-c +zone: us-central1-f remote_node: "{{ deployment_name }}-remote-node-0" +reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: a2-reservation-0 + project: "{{ project }}" cli_deployment_vars: region: "{{ region }}" zone: "{{ zone }}" network_name: "{{ network }}" - local_ssd_count_nvme_block: 8 + reservation_affinity: "{{ reservation_affinity }}" + local_ssd_count_nvme_block: 2 custom_vars: project: "{{ project }}" post_deploy_tests: From 6534ee37ec652325a3335437166288cb99cb2ecd Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Wed, 18 Dec 2024 17:06:05 +0000 Subject: [PATCH 040/140] Updated gke cluster module source path check --- pkg/config/expand.go | 2 +- pkg/config/expand_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 1005eb780f..87a8fac9fa 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -189,7 +189,7 @@ func (bp Blueprint) expandBackend(grp *Group) { func kubectlProviderRequiredModule(grp *Group) (bool, Module) { for _, mod := range grp.Modules { - if strings.Contains(mod.Source, "gke-cluster") || strings.Contains(mod.Source, "pre-existing-gke-cluster") { + if strings.Contains(mod.Source, "modules/scheduler/gke-cluster") || strings.Contains(mod.Source, "modules/scheduler/pre-existing-gke-cluster") { return true, mod } } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index f9f273efd8..bafb967eeb 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -106,11 +106,11 @@ func (s *zeroSuite) TestExpandProviders(c *C) { Configuration: testKubectlConf} testGKEClusterModule := Module{ - Source: "module/test/gke-cluster/dummy", + Source: "modules/scheduler/gke-cluster", ID: testGKEClusterModuleID} testPreExistingGKEClusterModule := Module{ - Source: "module/test/pre-existing-gke-cluster/dummy", + Source: "modules/scheduler/pre-existing-gke-cluster", ID: testGKEClusterModuleID} defaultProvider := map[string]PR{ From 86ec68ff3fd7b94c456fca84823b753411263e58 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 13 Dec 2024 06:00:29 +0000 Subject: [PATCH 041/140] SlurmGCP. 
Move TPU code into separate file + refactoring --- .../modules/slurm_files/scripts/conf.py | 3 +- .../slurm_files/scripts/get_tpu_vmcount.py | 6 +- .../modules/slurm_files/scripts/resume.py | 43 +-- .../modules/slurm_files/scripts/slurmsync.py | 27 +- .../modules/slurm_files/scripts/suspend.py | 41 +-- .../slurm_files/scripts/tests/test_resume.py | 6 +- .../scripts/tests/test_topology.py | 10 +- .../modules/slurm_files/scripts/tpu.py | 329 ++++++++++++++++++ .../modules/slurm_files/scripts/util.py | 254 -------------- 9 files changed, 367 insertions(+), 352 deletions(-) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tpu.py diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 4af58a7831..a4ff1e488a 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -21,6 +21,7 @@ from pathlib import Path import util from util import dirs, slurmdirs +import tpu FILE_PREAMBLE = """ # Warning: @@ -519,7 +520,7 @@ def _walk( def add_tpu_nodeset_topology(nodeset: object, bldr: TopologyBuilder, lkp: util.Lookup): - tpuobj = util.TPU(nodeset) + tpuobj = tpu.TPU.make(nodeset.nodeset_name, lkp) static, dynamic = lkp.nodenames(nodeset) pref = ["tpu-root", f"ns_{nodeset.nodeset_name}"] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py index 1557d6020b..1e194426fd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py @@ -16,12 +16,14 @@ import argparse import util +import tpu def get_vmcount_of_tpu_part(part): res = 0 - for ns in util.lookup().cfg.partitions[part].partition_nodeset_tpu: - tpu_obj = util.TPU(util.lookup().cfg.nodeset_tpu[ns]) + lkp = util.lookup() + for ns in lkp.cfg.partitions[part].partition_nodeset_tpu: + tpu_obj = tpu.TPU.make(ns, lkp) if res == 0: res = tpu_obj.vmcount else: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 5d88751a41..3a4fa74dc9 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -42,6 +42,7 @@ wait_for_operation, ) from util import lookup, NSDict +import tpu import slurm_gcp_plugins @@ -269,7 +270,8 @@ def group_nodes_bulk(nodes: List[str], resume_data: Optional[ResumeData], lkp: u def chunk_nodes(nodes: List[str]): chunk_size = BULK_INSERT_LIMIT if nodes and lkp.node_is_tpu(nodes[0]): - chunk_size = util.TPU(lkp.node_nodeset(nodes[0])).vmcount + ns = lkp.node_nodeset_name(nodes[0]) + chunk_size = tpu.TPU.make(ns, lkp).vmcount return chunked(nodes, n=chunk_size) chunks = [ @@ -287,34 +289,6 @@ def chunk_nodes(nodes: List[str]): return {chunk.name: chunk for chunk in chunks} -def start_tpu(data): - tpu = data["tpu"] - node = 
data["node"] - if len(node) == 1: - node = node[0] - log.debug( - f"Will create a TPU of type {tpu.node_type} tf_version {tpu.tf_version} in zone {tpu.zone} with name {node}" - ) - tpunode = tpu.get_node(node) - if tpunode is None: - if not tpu.create_node(nodename=node): - log.error("Error creating tpu node {node}") - else: - if tpu.preserve_tpu: - if not tpu.start_node(nodename=node): - log.error("Error starting tpu node {node}") - else: - log.info( - f"Tpu node {node} is already created, but will not start it because nodeset does not have preserve_tpu option active." - ) - else: - log.debug( - f"Will create a multi-vm TPU of type {tpu.node_type} tf_version {tpu.tf_version} in zone {tpu.zone} with name {node[0]}" - ) - if not tpu.create_node(nodename=node): - log.error("Error creating tpu node {node}") - - def resume_nodes(nodes: List[str], resume_data: Optional[ResumeData]): """resume nodes in nodelist""" # Prevent dormant nodes associated with a future reservation from being resumed @@ -339,17 +313,13 @@ def resume_nodes(nodes: List[str], resume_data: Optional[ResumeData]): "node bulk groups: \n{}".format(yaml.safe_dump(grouped_nodelists).rstrip()) ) - tpu_start_data = [] - tpu_objs = {} + tpu_chunks = [] bi_inserts = {} for group, chunk in grouped_nodes.items(): model = chunk.nodes[0] if lookup().node_is_tpu(model): - # do not create multiple tpu_objs if nodes with the same prefix are used - if chunk.prefix not in tpu_objs.keys(): - tpu_objs[chunk.prefix] = util.TPU(lookup().node_nodeset(model)) - tpu_start_data.append({"tpu": tpu_objs[chunk.prefix], "node": chunk.nodes}) + tpu_chunks.append(chunk.nodes) else: bi_inserts[group] = create_instances_request( chunk.nodes, chunk.placement_group, chunk.excl_job_id @@ -384,8 +354,7 @@ def resume_nodes(nodes: List[str], resume_data: Optional[ResumeData]): bulk_operations = {group: wait_for_operation(op) for group, op in started.items()} # Start TPU after regular nodes so that regular nodes are not affected by the slower TPU nodes - log.debug(f"tpu_start_data={yaml.safe_dump(tpu_start_data)}") - execute_with_futures(start_tpu, tpu_start_data) + execute_with_futures(tpu.start_tpu, tpu_chunks) all_successful_inserts = [] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index b06d093e78..65bf15ede5 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -22,7 +22,6 @@ import sys import shlex from datetime import datetime, timedelta -from enum import Enum from itertools import chain from pathlib import Path from dataclasses import dataclass @@ -40,13 +39,12 @@ separate, to_hostlist, NodeState, - TPU, chunked, dirs, ) from util import lookup from suspend import delete_instances -from resume import start_tpu +import tpu import conf log = logging.getLogger() @@ -130,18 +128,19 @@ def start_instance_op(inst): def start_instances(node_list): log.info("{} instances to start ({})".format(len(node_list), ",".join(node_list))) - - normal, tpu_nodes = separate(lookup().node_is_tpu, node_list) + lkp = lookup() + # TODO: use code from resume.py to assign proper placement + normal, tpu_nodes = separate(lkp.node_is_tpu, node_list) ops = {inst: start_instance_op(inst) for inst in normal} done, failed = batch_execute(ops) 
tpu_start_data = [] - for ns, nodes in util.groupby_unsorted(tpu_nodes, lookup().node_nodeset_name): - tpuobj = TPU(lookup().cfg.nodeset_tpu[ns]) + for ns, nodes in util.groupby_unsorted(tpu_nodes, lkp.node_nodeset_name): + tpuobj = tpu.TPU.make(ns, lkp) for snodes in chunked(nodes, n=tpuobj.vmcount): tpu_start_data.append({"tpu": tpuobj, "node": snodes}) - execute_with_futures(start_tpu, tpu_start_data) + execute_with_futures(tpu.start_tpu, tpu_start_data) def _find_dynamic_node_status() -> NodeAction: @@ -163,14 +162,14 @@ def get_fr_action(fr: FutureReservation, nodename:str, state:NodeState) -> Optio return NodeActionDown(reason=msg) def _find_tpu_node_action(nodename, state) -> NodeAction: - ns = lookup().node_nodeset(nodename) - tpuobj = TPU(ns) + lkp = lookup() + tpuobj = tpu.TPU.make(lkp.node_nodeset_name(nodename), lkp) inst = tpuobj.get_node(nodename) # If we do not find the node but it is from a Tpu that has multiple vms look for the master node if inst is None and tpuobj.vmcount > 1: # Get the tpu slurm nodelist of the nodes in the same tpu group as nodename nodelist = run( - f"{lookup().scontrol} show topo {nodename}" + f"{lkp.scontrol} show topo {nodename}" + " | awk -F'=' '/Level=0/ { print $NF }'", shell=True, ).stdout @@ -200,13 +199,13 @@ def _find_tpu_node_action(nodename, state) -> NodeAction: & state.flags ): return NodeActionDown(reason="Unbacked instance") - if lookup().is_static_node(nodename): + if lkp.is_static_node(nodename): return NodeActionPowerUp() elif ( state is not None and "POWERED_DOWN" not in state.flags and "POWERING_DOWN" not in state.flags - and inst.state == TPU.State.STOPPED + and inst.state == tpu.TPU.State.STOPPED ): if tpuobj.preemptible: return NodeActionPrempt() @@ -214,7 +213,7 @@ def _find_tpu_node_action(nodename, state) -> NodeAction: return NodeActionDown(reason="Instance terminated") elif ( state is None or "POWERED_DOWN" in state.flags - ) and inst.state == TPU.State.READY: + ) and inst.state == tpu.TPU.State.READY: return NodeActionDelete() elif state is None: # if state is None here, the instance exists but it's not in Slurm diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py index f01013e1a2..7d6ae28f9f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py @@ -27,9 +27,9 @@ to_hostlist, wait_for_operations, separate, - execute_with_futures, ) -from util import lookup, TPU +from util import lookup +import tpu import slurm_gcp_plugins @@ -58,33 +58,6 @@ def delete_instance_request(instance): return request -def stop_tpu(data): - tpu_nodeset = data["nodeset"] - node = data["node"] - tpu = data["tpu"] - if tpu_nodeset.preserve_tpu and tpu.vmcount == 1: - log.info(f"stopping node {node}") - if tpu.stop_node(node): - return - log.error("Error stopping node {node} will delete instead") - log.info(f"deleting node {node}") - if not tpu.delete_node(node): - log.error("Error deleting node {node}") - - -def delete_tpu_instances(instances): - stop_data = [] - for prefix, nodes in util.groupby_unsorted(instances, lookup().node_prefix): - log.info(f"Deleting TPU nodes from prefix {prefix}") - lnodes = list(nodes) - tpu_nodeset = lookup().node_nodeset(lnodes[0]) - tpu = TPU(tpu_nodeset) - stop_data.extend( - [{"tpu": tpu, "node": 
node, "nodeset": tpu_nodeset} for node in lnodes] - ) - execute_with_futures(stop_tpu, stop_data) - - def delete_instances(instances): """delete instances individually""" invalid, valid = separate(lambda inst: bool(lookup().instance(inst)), instances) @@ -106,15 +79,11 @@ def delete_instances(instances): def suspend_nodes(nodes: List[str]) -> None: - tpu_nodes, other_nodes = [], [] - for node in nodes[:]: - if lookup().node_is_tpu(node): - tpu_nodes.append(node) - else: - other_nodes.append(node) + lkp = lookup() + other_nodes, tpu_nodes = util.separate(lkp.node_is_tpu, nodes) delete_instances(other_nodes) - delete_tpu_instances(tpu_nodes) + tpu.delete_tpu_instances(tpu_nodes) def main(nodelist): diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_resume.py index 147ba00658..3c637bbe10 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_resume.py @@ -63,7 +63,7 @@ def test_get_resume_file_data(): mock_to_hostnames.assert_called_once_with("green-[0-2]") -@unittest.mock.patch("util.TPU") +@unittest.mock.patch("tpu.TPU.make") @unittest.mock.patch("resume.create_placements") def test_group_nodes_bulk(mock_create_placements, mock_tpu): cfg = TstCfg( @@ -106,8 +106,8 @@ def mock_create_placements_se(nodes, excl_job_id, lkp): raise AssertionError(f"unexpected invocation: '{args}'") mock_create_placements.side_effect = mock_create_placements_se - def mock_tpu_se(ns: TstNodeset) -> TstTPU: - if ns.nodeset_name == "t": + def mock_tpu_se(ns: str, lkp) -> TstTPU: + if ns == "t": return TstTPU(vmcount=2) raise AssertionError(f"unexpected invocation: '{ns}'") mock_tpu.side_effect = mock_tpu_se diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py index 78715bc5f6..a3680f15d7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -35,7 +35,7 @@ def test_gen_topology_conf_empty(): assert open(cfg.output_dir + "/cloud_topology.conf").read() == PRELUDE + "\n" -@mock.patch("util.TPU") +@mock.patch("tpu.TPU.make") def test_gen_topology_conf(tpu_mock): cfg = TstCfg( nodeset_tpu={ @@ -50,12 +50,12 @@ def test_gen_topology_conf(tpu_mock): output_dir=tempfile.mkdtemp(), ) - def tpu_se(ns: TstNodeset) -> TstTPU: - if ns.nodeset_name == "bold": + def tpu_se(ns: str, lkp) -> TstTPU: + if ns == "bold": return TstTPU(vmcount=3) - if ns.nodeset_name == "slim": + if ns == "slim": return TstTPU(vmcount=1) - raise AssertionError(f"unexpected TPU name: '{ns.nodeset_name}'") + raise AssertionError(f"unexpected TPU name: '{ns}'") tpu_mock.side_effect = tpu_se diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tpu.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tpu.py new file mode 100644 index 0000000000..d8632652f8 --- /dev/null +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tpu.py @@ -0,0 +1,329 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +import socket +import logging +from dataclasses import dataclass +from pathlib import Path +import yaml + +import util +from util import create_client_options, ApiEndpoint + +from google.cloud import tpu_v2 as tpu # noqa: E402 +import google.api_core.exceptions as gExceptions # noqa: E402 + +log = logging.getLogger() + +_tpu_cache = {} + +class TPU: + """Class for handling the TPU-vm nodes""" + + State = tpu.types.cloud_tpu.Node.State + TPUS_PER_VM = 4 + __expected_states = { + "create": State.READY, + "start": State.READY, + "stop": State.STOPPED, + } + + __tpu_version_mapping = { + "V2": tpu.AcceleratorConfig().Type.V2, + "V3": tpu.AcceleratorConfig().Type.V3, + "V4": tpu.AcceleratorConfig().Type.V4, + } + + @classmethod + def make(cls, nodeset_name: str, lkp: util.Lookup) -> "TPU": + key = (id(lkp), nodeset_name) + if key not in _tpu_cache: + nodeset = lkp.cfg.nodeset_tpu[nodeset_name] + _tpu_cache[key] = cls(nodeset, lkp) + return _tpu_cache[key] + + + def __init__(self, nodeset: object, lkp: util.Lookup): + self._nodeset = nodeset + self.lkp = lkp + self._parent = f"projects/{lkp.project}/locations/{nodeset.zone}" + co = create_client_options(ApiEndpoint.TPU) + self._client = tpu.TpuClient(client_options=co) + self.data_disks = [] + for data_disk in nodeset.data_disks: + ad = tpu.AttachedDisk() + ad.source_disk = data_disk + ad.mode = tpu.AttachedDisk.DiskMode.DISK_MODE_UNSPECIFIED + self.data_disks.append(ad) + ns_ac = nodeset.accelerator_config + if ns_ac.topology != "" and ns_ac.version != "": + ac = tpu.AcceleratorConfig() + ac.topology = ns_ac.topology + ac.type_ = self.__tpu_version_mapping[ns_ac.version] + self.ac = ac + else: + req = tpu.GetAcceleratorTypeRequest( + name=f"{self._parent}/acceleratorTypes/{nodeset.node_type}" + ) + self.ac = self._client.get_accelerator_type(req).accelerator_configs[0] + self.vmcount = self.__calc_vm_from_topology(self.ac.topology) + + @property + def nodeset(self): + return self._nodeset + + @property + def preserve_tpu(self): + return self._nodeset.preserve_tpu + + @property + def node_type(self): + return self._nodeset.node_type + + @property + def tf_version(self): + return self._nodeset.tf_version + + @property + def enable_public_ip(self): + return self._nodeset.enable_public_ip + + @property + def preemptible(self): + return self._nodeset.preemptible + + @property + def reserved(self): + return self._nodeset.reserved + + @property + def service_account(self): + return self._nodeset.service_account + + @property + def zone(self): + return self._nodeset.zone + + def check_node_type(self): + if self.node_type is None: + return False + try: + request = tpu.GetAcceleratorTypeRequest( + name=f"{self._parent}/acceleratorTypes/{self.node_type}" + ) + return self._client.get_accelerator_type(request=request) is not None + except 
Exception: + return False + + def check_tf_version(self): + try: + request = tpu.GetRuntimeVersionRequest( + name=f"{self._parent}/runtimeVersions/{self.tf_version}" + ) + return self._client.get_runtime_version(request=request) is not None + except Exception: + return False + + def __calc_vm_from_topology(self, topology): + topo = topology.split("x") + tot = 1 + for num in topo: + tot = tot * int(num) + return tot // self.TPUS_PER_VM + + def __check_resp(self, response, op_name): + des_state = self.__expected_states.get(op_name) + # If the state is not in the table just print the response + if des_state is None: + return False + if response.__class__.__name__ != "Node": # If the response is not a node fail + return False + if response.state == des_state: + return True + return False + + def list_nodes(self): + try: + request = tpu.ListNodesRequest(parent=self._parent) + res = self._client.list_nodes(request=request) + except gExceptions.NotFound: + res = None + return res + + def list_node_names(self): + return [node.name.split("/")[-1] for node in self.list_nodes()] + + def start_node(self, nodename): + request = tpu.StartNodeRequest(name=f"{self._parent}/nodes/{nodename}") + resp = self._client.start_node(request=request).result() + return self.__check_resp(resp, "start") + + def stop_node(self, nodename): + request = tpu.StopNodeRequest(name=f"{self._parent}/nodes/{nodename}") + resp = self._client.stop_node(request=request).result() + return self.__check_resp(resp, "stop") + + def get_node(self, nodename): + try: + request = tpu.GetNodeRequest(name=f"{self._parent}/nodes/{nodename}") + res = self._client.get_node(request=request) + except gExceptions.NotFound: + res = None + return res + + def _register_node(self, nodename, ip_addr): + dns_name = socket.getnameinfo((ip_addr, 0), 0)[0] + util.run( + f"{self.lkp.scontrol} update nodename={nodename} nodeaddr={ip_addr} nodehostname={dns_name}" + ) + + def create_node(self, nodename): + if self.vmcount > 1 and not isinstance(nodename, list): + log.error( + f"Tried to create a {self.vmcount} node TPU on nodeset {self._nodeset.nodeset_name} but only received one nodename {nodename}" + ) + return False + if self.vmcount > 1 and ( + isinstance(nodename, list) and len(nodename) != self.vmcount + ): + log.error( + f"Expected to receive a list of {self.vmcount} nodenames for TPU node creation in nodeset {self._nodeset.nodeset_name}, but received this list {nodename}" + ) + return False + + node = tpu.Node() + node.accelerator_config = self.ac + node.runtime_version = f"tpu-vm-tf-{self.tf_version}" + startup_script = """ + #!/bin/bash + echo "startup script not found > /var/log/startup_error.log" + """ + with open( + Path(self.lkp.cfg.slurm_scripts_dir or util.dirs.scripts) / "startup.sh", "r" + ) as script: + startup_script = script.read() + if isinstance(nodename, list): + node_id = nodename[0] + slurm_names = [] + wid = 0 + for node_wid in nodename: + slurm_names.append(f"WORKER_{wid}:{node_wid}") + wid += 1 + else: + node_id = nodename + slurm_names = [f"WORKER_0:{nodename}"] + node.metadata = { + "slurm_docker_image": self.nodeset.docker_image, + "startup-script": startup_script, + "slurm_instance_role": "compute", + "slurm_cluster_name": self.lkp.cfg.slurm_cluster_name, + "slurm_bucket_path": self.lkp.cfg.bucket_path, + "slurm_names": ";".join(slurm_names), + "universe_domain": util.universe_domain(), + } + node.tags = [self.lkp.cfg.slurm_cluster_name] + if self.nodeset.service_account: + node.service_account.email = 
self.nodeset.service_account.email + node.service_account.scope = self.nodeset.service_account.scopes + node.scheduling_config.preemptible = self.preemptible + node.scheduling_config.reserved = self.reserved + node.network_config.subnetwork = self.nodeset.subnetwork + node.network_config.enable_external_ips = self.enable_public_ip + if self.data_disks: + node.data_disks = self.data_disks + + request = tpu.CreateNodeRequest(parent=self._parent, node=node, node_id=node_id) + resp = self._client.create_node(request=request).result() + if not self.__check_resp(resp, "create"): + return False + if isinstance(nodename, list): + for node_id, net_endpoint in zip(nodename, resp.network_endpoints): + self._register_node(node_id, net_endpoint.ip_address) + else: + ip_add = resp.network_endpoints[0].ip_address + self._register_node(nodename, ip_add) + return True + + def delete_node(self, nodename): + request = tpu.DeleteNodeRequest(name=f"{self._parent}/nodes/{nodename}") + try: + resp = self._client.delete_node(request=request).result() + if resp: + return self.get_node(nodename=nodename) is None + return False + except gExceptions.NotFound: + # log only error if vmcount is 1 as for other tpu vm count, this could be "phantom" nodes + if self.vmcount == 1: + log.error(f"Tpu single node {nodename} not found") + else: + # for the TPU nodes that consist in more than one vm, only the first node of the TPU a.k.a. the master node will + # exist as real TPU nodes, so the other ones are expected to not be found, check the hostname of the node that has + # not been found, and if it ends in 0, it means that is the master node and it should have been found, and in consequence + # log an error + nodehostname = yaml.safe_load( + util.run(f"{self.lkp.scontrol} --yaml show node {nodename}").stdout.rstrip() + )["nodes"][0]["hostname"] + if nodehostname.split("-")[-1] == "0": + log.error(f"TPU master node {nodename} not found") + else: + log.info(f"Deleted TPU 'phantom' node {nodename}") + # If the node is not found it is tecnichally deleted, so return success. + return True + +def _stop_tpu(node: str) -> None: + lkp = util.lookup() + tpuobj = TPU.make(lkp.node_nodeset_name(node), lkp) + if tpuobj.nodeset.preserve_tpu and tpuobj.vmcount == 1: + log.info(f"stopping node {node}") + if tpuobj.stop_node(node): + return + log.error("Error stopping node {node} will delete instead") + log.info(f"deleting node {node}") + if not tpuobj.delete_node(node): + log.error("Error deleting node {node}") + + +def delete_tpu_instances(instances: List[str]) -> None: + util.execute_with_futures(_stop_tpu, instances) + + +def start_tpu(node: List[str]): + lkp = util.lookup() + tpuobj = TPU.make(lkp.node_nodeset_name(node[0]), lkp) + + if len(node) == 1: + node = node[0] + log.debug( + f"Will create a TPU of type {tpuobj.node_type} tf_version {tpuobj.tf_version} in zone {tpuobj.zone} with name {node}" + ) + tpunode = tpuobj.get_node(node) + if tpunode is None: + if not tpuobj.create_node(nodename=node): + log.error("Error creating tpu node {node}") + else: + if tpuobj.preserve_tpu: + if not tpuobj.start_node(nodename=node): + log.error("Error starting tpu node {node}") + else: + log.info( + f"Tpu node {node} is already created, but will not start it because nodeset does not have preserve_tpu option active." 
+ ) + else: + log.debug( + f"Will create a multi-vm TPU of type {tpuobj.node_type} tf_version {tpuobj.tf_version} in zone {tpuobj.zone} with name {node[0]}" + ) + if not tpuobj.create_node(nodename=node): + log.error("Error creating tpu node {node}") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 017443002f..62fafde6e1 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -56,13 +56,6 @@ from google.api_core.client_options import ClientOptions # noqa: E402 import httplib2 # noqa: E402 -try: - from google.cloud import tpu_v2 as tpu # noqa: E402 - can_tpu = True -except ImportError: # TODO: remove once CentOS 7 is deprecated or dependency is added - f"WARNING: Missing Python module 'google.cloud.tpu_v2 (pip:google-cloud-tpu)', TPU support will not work." - can_tpu = False - import google.api_core.exceptions as gExceptions # noqa: E402 from requests import get as get_url # noqa: E402 @@ -1189,253 +1182,6 @@ def represent_path(dumper, path): return dumper.represent_scalar("tag:yaml.org,2002:str", str(path)) -class TPU: - """Class for handling the TPU-vm nodes""" - - if can_tpu: - State = tpu.types.cloud_tpu.Node.State - TPUS_PER_VM = 4 - __expected_states = { - "create": State.READY, - "start": State.READY, - "stop": State.STOPPED, - } - - __tpu_version_mapping = { - "V2": tpu.AcceleratorConfig().Type.V2, - "V3": tpu.AcceleratorConfig().Type.V3, - "V4": tpu.AcceleratorConfig().Type.V4, - } - - def __init__(self, nodeset): - if not can_tpu: - raise Exception("TPU pip package not installed") - self._nodeset = nodeset - self._parent = f"projects/{lookup().project}/locations/{nodeset.zone}" - co = create_client_options(ApiEndpoint.TPU) - self._client = tpu.TpuClient(client_options=co) - self.data_disks = [] - for data_disk in nodeset.data_disks: - ad = tpu.AttachedDisk() - ad.source_disk = data_disk - ad.mode = tpu.AttachedDisk.DiskMode.DISK_MODE_UNSPECIFIED - self.data_disks.append(ad) - ns_ac = nodeset.accelerator_config - if ns_ac.topology != "" and ns_ac.version != "": - ac = tpu.AcceleratorConfig() - ac.topology = ns_ac.topology - ac.type_ = self.__tpu_version_mapping[ns_ac.version] - self.ac = ac - else: - req = tpu.GetAcceleratorTypeRequest( - name=f"{self._parent}/acceleratorTypes/{nodeset.node_type}" - ) - self.ac = self._client.get_accelerator_type(req).accelerator_configs[0] - self.vmcount = self.__calc_vm_from_topology(self.ac.topology) - - @property - def nodeset(self): - return self._nodeset - - @property - def preserve_tpu(self): - return self._nodeset.preserve_tpu - - @property - def node_type(self): - return self._nodeset.node_type - - @property - def tf_version(self): - return self._nodeset.tf_version - - @property - def enable_public_ip(self): - return self._nodeset.enable_public_ip - - @property - def preemptible(self): - return self._nodeset.preemptible - - @property - def reserved(self): - return self._nodeset.reserved - - @property - def service_account(self): - return self._nodeset.service_account - - @property - def zone(self): - return self._nodeset.zone - - def check_node_type(self): - if self.node_type is None: - return False - try: - request = tpu.GetAcceleratorTypeRequest( - name=f"{self._parent}/acceleratorTypes/{self.node_type}" - ) - return 
self._client.get_accelerator_type(request=request) is not None - except Exception: - return False - - def check_tf_version(self): - try: - request = tpu.GetRuntimeVersionRequest( - name=f"{self._parent}/runtimeVersions/{self.tf_version}" - ) - return self._client.get_runtime_version(request=request) is not None - except Exception: - return False - - def __calc_vm_from_topology(self, topology): - topo = topology.split("x") - tot = 1 - for num in topo: - tot = tot * int(num) - return tot // self.TPUS_PER_VM - - def __check_resp(self, response, op_name): - des_state = self.__expected_states.get(op_name) - # If the state is not in the table just print the response - if des_state is None: - return False - if response.__class__.__name__ != "Node": # If the response is not a node fail - return False - if response.state == des_state: - return True - return False - - def list_nodes(self): - try: - request = tpu.ListNodesRequest(parent=self._parent) - res = self._client.list_nodes(request=request) - except gExceptions.NotFound: - res = None - return res - - def list_node_names(self): - return [node.name.split("/")[-1] for node in self.list_nodes()] - - def start_node(self, nodename): - request = tpu.StartNodeRequest(name=f"{self._parent}/nodes/{nodename}") - resp = self._client.start_node(request=request).result() - return self.__check_resp(resp, "start") - - def stop_node(self, nodename): - request = tpu.StopNodeRequest(name=f"{self._parent}/nodes/{nodename}") - resp = self._client.stop_node(request=request).result() - return self.__check_resp(resp, "stop") - - def get_node(self, nodename): - try: - request = tpu.GetNodeRequest(name=f"{self._parent}/nodes/{nodename}") - res = self._client.get_node(request=request) - except gExceptions.NotFound: - res = None - return res - - def _register_node(self, nodename, ip_addr): - dns_name = socket.getnameinfo((ip_addr, 0), 0)[0] - run( - f"{lookup().scontrol} update nodename={nodename} nodeaddr={ip_addr} nodehostname={dns_name}" - ) - - def create_node(self, nodename): - if self.vmcount > 1 and not isinstance(nodename, list): - log.error( - f"Tried to create a {self.vmcount} node TPU on nodeset {self._nodeset.nodeset_name} but only received one nodename {nodename}" - ) - return False - if self.vmcount > 1 and ( - isinstance(nodename, list) and len(nodename) != self.vmcount - ): - log.error( - f"Expected to receive a list of {self.vmcount} nodenames for TPU node creation in nodeset {self._nodeset.nodeset_name}, but received this list {nodename}" - ) - return False - - node = tpu.Node() - node.accelerator_config = self.ac - node.runtime_version = f"tpu-vm-tf-{self.tf_version}" - startup_script = """ - #!/bin/bash - echo "startup script not found > /var/log/startup_error.log" - """ - with open( - Path(lookup().cfg.slurm_scripts_dir or dirs.scripts) / "startup.sh", "r" - ) as script: - startup_script = script.read() - if isinstance(nodename, list): - node_id = nodename[0] - slurm_names = [] - wid = 0 - for node_wid in nodename: - slurm_names.append(f"WORKER_{wid}:{node_wid}") - wid += 1 - else: - node_id = nodename - slurm_names = [f"WORKER_0:{nodename}"] - node.metadata = { - "slurm_docker_image": self.nodeset.docker_image, - "startup-script": startup_script, - "slurm_instance_role": "compute", - "slurm_cluster_name": lookup().cfg.slurm_cluster_name, - "slurm_bucket_path": lookup().cfg.bucket_path, - "slurm_names": ";".join(slurm_names), - "universe_domain": universe_domain(), - } - node.tags = [lookup().cfg.slurm_cluster_name] - if 
self.nodeset.service_account: - node.service_account.email = self.nodeset.service_account.email - node.service_account.scope = self.nodeset.service_account.scopes - node.scheduling_config.preemptible = self.preemptible - node.scheduling_config.reserved = self.reserved - node.network_config.subnetwork = self.nodeset.subnetwork - node.network_config.enable_external_ips = self.enable_public_ip - if self.data_disks: - node.data_disks = self.data_disks - - request = tpu.CreateNodeRequest(parent=self._parent, node=node, node_id=node_id) - resp = self._client.create_node(request=request).result() - if not self.__check_resp(resp, "create"): - return False - if isinstance(nodename, list): - for node_id, net_endpoint in zip(nodename, resp.network_endpoints): - self._register_node(node_id, net_endpoint.ip_address) - else: - ip_add = resp.network_endpoints[0].ip_address - self._register_node(nodename, ip_add) - return True - - def delete_node(self, nodename): - request = tpu.DeleteNodeRequest(name=f"{self._parent}/nodes/{nodename}") - try: - resp = self._client.delete_node(request=request).result() - if resp: - return self.get_node(nodename=nodename) is None - return False - except gExceptions.NotFound: - # log only error if vmcount is 1 as for other tpu vm count, this could be "phantom" nodes - if self.vmcount == 1: - log.error(f"Tpu single node {nodename} not found") - else: - # for the TPU nodes that consist in more than one vm, only the first node of the TPU a.k.a. the master node will - # exist as real TPU nodes, so the other ones are expected to not be found, check the hostname of the node that has - # not been found, and if it ends in 0, it means that is the master node and it should have been found, and in consequence - # log an error - nodehostname = yaml.safe_load( - run(f"{lookup().scontrol} --yaml show node {nodename}").stdout.rstrip() - )["nodes"][0]["hostname"] - if nodehostname.split("-")[-1] == "0": - log.error(f"TPU master node {nodename} not found") - else: - log.info(f"Deleted TPU 'phantom' node {nodename}") - # If the node is not found it is tecnichally deleted, so return success. - return True - - @dataclass(frozen=True) class ReservationDetails: project: str From f8c81a3e49969c4bff5d7379d4a4a69a677214f8 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 13 Dec 2024 18:17:42 +0000 Subject: [PATCH 042/140] SlurmGCP. 
Don't query insert-ops if builkInsert-op is totally successful --- .../modules/slurm_files/scripts/resume.py | 96 +++++++++++-------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 5d88751a41..a26762ced6 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -387,48 +387,66 @@ def resume_nodes(nodes: List[str], resume_data: Optional[ResumeData]): log.debug(f"tpu_start_data={yaml.safe_dump(tpu_start_data)}") execute_with_futures(start_tpu, tpu_start_data) - all_successful_inserts = [] - - for group, bulk_op in bulk_operations.items(): - group_id = bulk_op["operationGroupId"] - bulk_op_name = bulk_op["name"] - if "error" in bulk_op: - error = bulk_op["error"]["errors"][0] - group_nodes = to_hostlist(grouped_nodes[group].nodes) - log.warning( - f"bulkInsert operation errors: {error['code']} name={bulk_op_name} operationGroupId={group_id} nodes={group_nodes}" - ) - successful_inserts, failed_inserts = separate( - lambda op: "error" in op, get_insert_operations(group_id) + for group, op in bulk_operations.items(): + _handle_bulk_insert_op(op, grouped_nodes[group].nodes, resume_data) + + +def _handle_bulk_insert_op(op: object, nodes: List[str], resume_data: Optional[ResumeData]) -> None: + """ + Handles **DONE** BulkInsert operations + """ + assert op["operationType"] == "bulkInsert" and op["status"] == "DONE", f"unexpected op: {op}" + + group_id = op["operationGroupId"] + if "error" in op: + error = op["error"]["errors"][0] + log.warning( + f"bulkInsert operation error: {error['code']} name={op['name']} operationGroupId={group_id} nodes={to_hostlist(nodes)}" ) - # Apparently multiple errors are possible... so join with +. - by_error_inserts = util.groupby_unsorted( - failed_inserts, - lambda op: "+".join(err["code"] for err in op["error"]["errors"]), + # TODO: does it make sense to query for insert-ops in case of bulkInsert-op error? + + created = 0 + for status in op["instancesBulkInsertOperationMetadata"]["perLocationStatus"].values(): + created += status.get("createdVmCount", 0) + if created == len(nodes): + log.info(f"created {len(nodes)} instances: nodes={to_hostlist(nodes)}") + return # no need to gather status of insert-operations. + + # TODO: + # * don't perform globalOperations aggregateList request to gather insert-operations, + # instead use specific locations from `instancesBulkInsertOperationMetadata`, + # most of the time single zone should be sufficient. + # * don't gather insert-operations per bulkInsert request, instead aggregate it across + # all bulkInserts (goes one level above this function) + successful_inserts, failed_inserts = separate( + lambda op: "error" in op, get_insert_operations(group_id) + ) + # Apparently multiple errors are possible... so join with +. 
+ by_error_inserts = util.groupby_unsorted( + failed_inserts, + lambda op: "+".join(err["code"] for err in op["error"]["errors"]), + ) + for code, failed_ops in by_error_inserts: + failed_nodes = {trim_self_link(op["targetLink"]): op for op in failed_ops} + hostlist = util.to_hostlist(failed_nodes) + count = len(failed_nodes) + log.error( + f"{count} instances failed to start: {code} ({hostlist}) operationGroupId={group_id}" + ) + failed_node, failed_op = next(iter(failed_nodes.items())) + msg = "; ".join( + f"{err['code']}: {err['message'] if 'message' in err else 'no message'}" + for err in failed_op["error"]["errors"] + ) + if code != "RESOURCE_ALREADY_EXISTS": + down_nodes_notify_jobs(failed_nodes, f"GCP Error: {msg}", resume_data) + log.error( + f"errors from insert for node '{failed_node}' ({failed_op['name']}): {msg}" ) - for code, failed_ops in by_error_inserts: - failed_nodes = {trim_self_link(op["targetLink"]): op for op in failed_ops} - hostlist = util.to_hostlist(failed_nodes) - count = len(failed_nodes) - log.error( - f"{count} instances failed to start: {code} ({hostlist}) operationGroupId={group_id}" - ) - failed_node, failed_op = next(iter(failed_nodes.items())) - msg = "; ".join( - f"{err['code']}: {err['message'] if 'message' in err else 'no message'}" - for err in failed_op["error"]["errors"] - ) - if code != "RESOURCE_ALREADY_EXISTS": - down_nodes_notify_jobs(failed_nodes, f"GCP Error: {msg}", resume_data) - log.error( - f"errors from insert for node '{failed_node}' ({failed_op['name']}): {msg}" - ) - ready_nodes = {trim_self_link(op["targetLink"]) for op in successful_inserts} - if len(ready_nodes) > 0: - ready_nodelist = to_hostlist(ready_nodes) - log.info(f"created {len(ready_nodes)} instances: nodes={ready_nodelist}") - all_successful_inserts.extend(successful_inserts) + ready_nodes = {trim_self_link(op["targetLink"]) for op in successful_inserts} + if len(ready_nodes) > 0: + log.info(f"created {len(ready_nodes)} instances: nodes={to_hostlist(ready_nodes)}") def down_nodes_notify_jobs(nodes: List[str], reason: str, resume_data: Optional[ResumeData]) -> None: From a39d1c215c4d678974d05df5768449e0fa1598d1 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Wed, 18 Dec 2024 19:19:26 +0000 Subject: [PATCH 043/140] Update ops to operation --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index 73272df3cb..46ab3d11c0 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1525,11 +1525,11 @@ This blueprint shows how to use managed parallelstore storage options with GKE i The blueprint contains the following: * A K8s Job that uses a managed parallelstore storage volume option. -* A K8s Job that demonstrates ML training workload with managed parallelstore storage disk ops. +* A K8s Job that demonstrates ML training workload with managed parallelstore storage disk operation. > **Warning**: In this example blueprint, when storage type `Parallelstore` is specified in `gke-storage` module. > The lifecycle of the parallelstore is managed by the blueprint. -> On glcuster destroy ops, the Parallelstore storage created will also be destroyed. +> On glcuster destroy operation, the Parallelstore storage created will also be destroyed. > > [!Note] > The Kubernetes API server will only allow requests from authorized networks. 
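
The bulkInsert early-exit introduced in the resume.py change above works by summing `createdVmCount` across every `perLocationStatus` entry of the DONE operation and only falling back to per-instance insert-operation queries when some requested VMs are missing. A minimal sketch of that check, assuming a trimmed-down, hypothetical operation payload that contains only the fields used here (not a real API response):

```python
def all_vms_created(bulk_op: dict, requested: int) -> bool:
    """True when a DONE bulkInsert operation reports every requested VM as created."""
    per_location = bulk_op["instancesBulkInsertOperationMetadata"]["perLocationStatus"]
    created = sum(status.get("createdVmCount", 0) for status in per_location.values())
    return created == requested

# Hypothetical sample payload for illustration only.
sample_op = {
    "operationType": "bulkInsert",
    "status": "DONE",
    "instancesBulkInsertOperationMetadata": {
        "perLocationStatus": {
            "zones/us-central1-a": {"createdVmCount": 3},
            "zones/us-central1-b": {"createdVmCount": 1},
        }
    },
}

assert all_vms_created(sample_op, 4)      # every node came up: skip insert-op queries
assert not all_vms_created(sample_op, 6)  # shortfall: gather insert operations for error details
```
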
From 8c26d4a1aac024b0ce848e5e1ab7b962b8e1812b Mon Sep 17 00:00:00 2001 From: ighosh98 Date: Wed, 18 Dec 2024 18:18:15 +0000 Subject: [PATCH 044/140] update tas job definitions and add required toleration to kueue v0.10.0 --- .../management/kubectl-apply/manifests/kueue-v0.10.0.yaml | 1 + .../blueprints/kueue-config-files/tas-queues.yaml | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/management/kubectl-apply/manifests/kueue-v0.10.0.yaml b/modules/management/kubectl-apply/manifests/kueue-v0.10.0.yaml index 696e9b1ffb..8fb5db3638 100644 --- a/modules/management/kubectl-apply/manifests/kueue-v0.10.0.yaml +++ b/modules/management/kubectl-apply/manifests/kueue-v0.10.0.yaml @@ -12465,6 +12465,7 @@ spec: - configMap: name: kueue-manager-config name: manager-config + tolerations: - effect: NoSchedule key: components.gke.io/gke-managed-components operator: Equal diff --git a/tools/cloud-build/daily-tests/blueprints/kueue-config-files/tas-queues.yaml b/tools/cloud-build/daily-tests/blueprints/kueue-config-files/tas-queues.yaml index adaae65769..139bd3cbf5 100644 --- a/tools/cloud-build/daily-tests/blueprints/kueue-config-files/tas-queues.yaml +++ b/tools/cloud-build/daily-tests/blueprints/kueue-config-files/tas-queues.yaml @@ -31,6 +31,10 @@ spec: nodeLabels: cloud.google.com/gke-nodepool: "a2-highgpu-2g-a2highgpupool" topologyName: "gke-default" + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: NoSchedule --- apiVersion: kueue.x-k8s.io/v1beta1 kind: ClusterQueue @@ -44,7 +48,7 @@ spec: - name: "tas-flavor" resources: - name: "nvidia.com/gpu" - nominalQuota: 12 # 6 nodes, 2 GPU each + nominalQuota: 10000000 # infinite quota --- apiVersion: kueue.x-k8s.io/v1beta1 kind: LocalQueue From af1575358170a98bb0286fdada078b4e0b2d6fc2 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Wed, 18 Dec 2024 20:00:56 +0000 Subject: [PATCH 045/140] fix linter errors and deploy and test the blueprint --- examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml | 2 +- examples/gke-a3-ultragpu/nccl-installer.yaml | 70 +++++++++---------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml index 7069b90797..72bf5e2bf2 100644 --- a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml @@ -110,7 +110,7 @@ deployment_groups: - name: no-minor-or-node-upgrades-indefinite start_time: "2024-12-01T00:00:00Z" end_time: "2025-12-22T00:00:00Z" - exclusion_scope: NO_MINOR_OR_NODE_UPGRADES + exclusion_scope: NO_MINOR_OR_NODE_UPGRADES additional_networks: $(concat( [{ diff --git a/examples/gke-a3-ultragpu/nccl-installer.yaml b/examples/gke-a3-ultragpu/nccl-installer.yaml index f2239b2584..0227658184 100644 --- a/examples/gke-a3-ultragpu/nccl-installer.yaml +++ b/examples/gke-a3-ultragpu/nccl-installer.yaml @@ -36,45 +36,45 @@ spec: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - - key: cloud.google.com/gke-accelerator - operator: In - values: - - nvidia-h200-141gb + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: In + values: + - nvidia-h200-141gb tolerations: - - operator: "Exists" + - operator: "Exists" hostNetwork: true hostPID: true volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia/lib64 + type: DirectoryOrCreate + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + 
initContainers: + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + name: nccl-rdma-installer + resources: + requests: + cpu: 150m + securityContext: + privileged: true + volumeMounts: - name: library-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia/lib64 - type: DirectoryOrCreate + mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64 - name: gib - hostPath: - path: /home/kubernetes/bin/gib - initContainers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 - name: nccl-rdma-installer - resources: - requests: - cpu: 150m - securityContext: - privileged: true - volumeMounts: - - name: library-dir-host - mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64 - - name: gib - mountPath: /usr/local/home/kubernetes/bin/gib - command: ["/bin/sh", "-c"] - args: - - | - set -ex - /scripts/container_entry.sh install --install-nccl - cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64 - cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib - ibv_devinfo || exit 1 - echo "installation finishes" + mountPath: /usr/local/home/kubernetes/bin/gib + command: ["/bin/sh", "-c"] + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64 + cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib + ibv_devinfo || exit 1 + echo "installation finishes" containers: - - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830" - name: pause + - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830" + name: pause From 53571b3ac70caf362f05d0427db18b397b2858ae Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:08:52 +0000 Subject: [PATCH 046/140] fix linter errors --- .../gke-a3-ultragpu-deployment.yaml | 14 +- .../gke-a3-ultragpu/nccl-jobset-example.yaml | 385 +++++++++--------- .../gke-a3-ultragpu/nccl-test-32-node.yaml | 385 +++++++++--------- 3 files changed, 405 insertions(+), 379 deletions(-) diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml index 0e475ec2d6..ae897e23d7 100644 --- a/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml @@ -16,15 +16,15 @@ terraform_backend_defaults: type: gcs configuration: - bucket: BUCKET_NAME + bucket: gke-a3u-manual-test vars: deployment_name: gke-a3-ultra - project_id: PROJECT_ID - region: COMPUTE_REGION - zone: COMPUTE_ZONE - authorized_cidr: / + project_id: hpc-toolkit-dev + region: europe-west1 + zone: europe-west1-b + authorized_cidr: 0.0.0.0/0 # In order to not target a BLOCK_NAME, extended_reservation can be inputted as # extended_reservation: RESERVATION_NAME - extended_reservation: RESERVATION_NAME/reservationBlocks/BLOCK_NAME - static_node_count: NODE_COUNT + extended_reservation: slurm-dev-gcp-a3u-gsc + static_node_count: 0 diff --git a/examples/gke-a3-ultragpu/nccl-jobset-example.yaml b/examples/gke-a3-ultragpu/nccl-jobset-example.yaml index da49668d0a..4e3a437604 100644 --- a/examples/gke-a3-ultragpu/nccl-jobset-example.yaml +++ b/examples/gke-a3-ultragpu/nccl-jobset-example.yaml @@ -1,3 +1,17 @@ +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: @@ -9,200 +23,199 @@ spec: network: enableDNSHostnames: true replicatedJobs: - - name: w - template: - spec: - parallelism: 4 - completions: 4 - - template: - metadata: - annotations: - networking.gke.io/default-interface: 'eth0' - networking.gke.io/interfaces: | - [ - {"interfaceName":"eth0","network":"default"}, - {"interfaceName":"eth1","network":"gke-a3-ultra-sub-1"}, - {"interfaceName":"eth2","network":"gke-a3-ultra-rdma-sub-0"}, - {"interfaceName":"eth3","network":"gke-a3-ultra-rdma-sub-1"}, - {"interfaceName":"eth4","network":"gke-a3-ultra-rdma-sub-2"}, - {"interfaceName":"eth5","network":"gke-a3-ultra-rdma-sub-3"}, - {"interfaceName":"eth6","network":"gke-a3-ultra-rdma-sub-4"}, - {"interfaceName":"eth7","network":"gke-a3-ultra-rdma-sub-5"}, - {"interfaceName":"eth8","network":"gke-a3-ultra-rdma-sub-6"}, - {"interfaceName":"eth9","network":"gke-a3-ultra-rdma-sub-7"} - ] - spec: - # Limit benchmark run duration - activeDeadlineSeconds: 3600 - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-nodepool: a3-ultragpu-8g-a3-ultragpu-pool - tolerations: - - key: cloud.google.com/gke-queued - effect: NoSchedule - value: "true" - - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - setHostnameAsFQDN: true - volumes: - - name: gib - hostPath: - path: /home/kubernetes/bin/gib + - name: w + template: + spec: + parallelism: 4 + completions: 4 + + template: + metadata: + annotations: + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gke-a3-ultra-sub-1"}, + {"interfaceName":"eth2","network":"gke-a3-ultra-rdma-sub-0"}, + {"interfaceName":"eth3","network":"gke-a3-ultra-rdma-sub-1"}, + {"interfaceName":"eth4","network":"gke-a3-ultra-rdma-sub-2"}, + {"interfaceName":"eth5","network":"gke-a3-ultra-rdma-sub-3"}, + {"interfaceName":"eth6","network":"gke-a3-ultra-rdma-sub-4"}, + {"interfaceName":"eth7","network":"gke-a3-ultra-rdma-sub-5"}, + {"interfaceName":"eth8","network":"gke-a3-ultra-rdma-sub-6"}, + {"interfaceName":"eth9","network":"gke-a3-ultra-rdma-sub-7"} + ] + spec: + # Limit benchmark run duration + activeDeadlineSeconds: 3600 + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-nodepool: a3-ultragpu-8g-a3-ultragpu-pool + tolerations: + - key: cloud.google.com/gke-queued + effect: NoSchedule + value: "true" + + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + + setHostnameAsFQDN: true + volumes: + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: nvidia + hostPath: + path: /home/kubernetes/bin/nvidia + - name: lib64 + hostPath: + path: /lib64 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + - name: sys + hostPath: + path: /sys + - name: proc-sys + hostPath: + path: /proc/sys + schedulingGates: + # Set this to a unique name per job. 
+ - name: "gke.io/topology-aware-auto-ag-4" + + initContainers: + - name: gpu-healthcheck + image: alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + apk add --no-cache bash # Install bash + /bin/bash -c "set -ex + NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) + if [ \${NUM_GPUS} -lt 8 ]; then + echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" + exit 1 + fi + gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) + for gpu_index in \${!gpu_errors[@]}; do + if [ \${gpu_errors[\$gpu_index]} == '[N/A]' ]; then + echo 'Error: ERR detected in GPU index '\$gpu_index + exit 1 + elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then + echo 'Error: Unrecoverable ECC errors detected in GPU index '\$gpu_index + exit 1 + fi + done + echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" + + volumeMounts: - name: nvidia - hostPath: - path: /home/kubernetes/bin/nvidia + mountPath: /usr/local/nvidia - name: lib64 - hostPath: - path: /lib64 - - name: shared-memory - emptyDir: - medium: "Memory" - sizeLimit: 250Gi - - name: sys - hostPath: - path: /sys - - name: proc-sys - hostPath: - path: /proc/sys - schedulingGates: - # Set this to a unique name per job. - - name: "gke.io/topology-aware-auto-ag-4" - - initContainers: - - name: gpu-healthcheck - image: alpine:latest - command: ["/bin/sh", "-c"] - args: - - | - apk add --no-cache bash # Install bash - /bin/bash -c "set -ex - NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) - if [ \${NUM_GPUS} -lt 8 ]; then - echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" - exit 1 - fi - gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) - for gpu_index in \${!gpu_errors[@]}; do - if [ \${gpu_errors[\$gpu_index]} == '[N/A]' ]; then - echo 'Error: ERR detected in GPU index '\$gpu_index - exit 1 - elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then - echo 'Error: Unrecoverable ECC errors detected in GPU index '\$gpu_index - exit 1 - fi - done - echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" - - volumeMounts: - - name: nvidia - mountPath: /usr/local/nvidia - - name: lib64 - mountPath: /lib64 - securityContext: - privileged: true - env: - - name: LD_LIBRARY_PATH - value: /usr/local/nvidia/lib64 - - containers: - - name: nccl - stdin: true - tty: true - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 - securityContext: - privileged: true - env: - - name: MY_NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: OMPI_ALLOW_RUN_AS_ROOT - value: "1" - - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM - value: "1" - command: - - bash - - -c - - | - set -x - export N_NODES=4 - echo "Starting workload container on ${MY_NODE_NAME} for $N_NODES benchmark" - - # Load all the cuda libs - /sbin/ldconfig - - # Install ping - apt update -y - apt install -y iputils-ping - - # Start sshd - /scripts/container_entry.sh daemon & - - # Get helper variables to form all hostnames - export POSTFIX=$(hostname | cut -d . -f 2-) - export WORKERS_BASENAME=$(hostname | cut -d . 
-f 1 | rev | cut -d - -f 2- | rev ) - export NODE_RANK=$JOB_COMPLETION_INDEX - - - # For every worker, wait till online and add to hostfile - for i in `seq 0 $(($N_NODES-1))`; do - OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} - until ssh -p 222 -o StrictHostKeyChecking=no $OTHER hostname; do - echo Waiting for ${OTHER}... - sleep 10 - done - echo ${OTHER} port=222 slots=8 | tee -a /tmp/hostfile; + mountPath: /lib64 + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + + containers: + - name: nccl + stdin: true + tty: true + image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + securityContext: + privileged: true + env: + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: OMPI_ALLOW_RUN_AS_ROOT + value: "1" + - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM + value: "1" + command: + - bash + - -c + - | + set -x + export N_NODES=4 + echo "Starting workload container on ${MY_NODE_NAME} for $N_NODES benchmark" + + # Load all the cuda libs + /sbin/ldconfig + + # Install ping + apt update -y + apt install -y iputils-ping + + # Start sshd + /scripts/container_entry.sh daemon & + + # Get helper variables to form all hostnames + export POSTFIX=$(hostname | cut -d . -f 2-) + export WORKERS_BASENAME=$(hostname | cut -d . -f 1 | rev | cut -d - -f 2- | rev ) + export NODE_RANK=$JOB_COMPLETION_INDEX + + + # For every worker, wait till online and add to hostfile + for i in `seq 0 $(($N_NODES-1))`; do + OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} + until ssh -p 222 -o StrictHostKeyChecking=no $OTHER hostname; do + echo Waiting for ${OTHER}... + sleep 10 done + echo ${OTHER} port=222 slots=8 | tee -a /tmp/hostfile; + done - cat /tmp/hostfile + cat /tmp/hostfile - # Launch from head node - if [[ "${NODE_RANK}" -eq "0" ]]; then + # Launch from head node + if [[ "${NODE_RANK}" -eq "0" ]]; then - # World Level = 0x0, Rail Aligned = 0x7 - export NCCL_TESTS_SPLIT_MASK="0x0"; + # World Level = 0x0, Rail Aligned = 0x7 + export NCCL_TESTS_SPLIT_MASK="0x0"; - # Force use of libnccl-gib - export NCCL_NET=gIB + # Force use of libnccl-gib + export NCCL_NET=gIB - # Set all the correct libnccl-gib environment variables - source /usr/local/gib/scripts/set_nccl_env.sh + # Set all the correct libnccl-gib environment variables + source /usr/local/gib/scripts/set_nccl_env.sh - # Get all relevant NCCL / env vars to pass to all workers - ENV_VARS=$(echo ${!NCCL*} ${!OMPI*} LD_LIBRARY_PATH PATH | sed 's/ / -x /g') + # Get all relevant NCCL / env vars to pass to all workers + ENV_VARS=$(echo ${!NCCL*} ${!OMPI*} LD_LIBRARY_PATH PATH | sed 's/ / -x /g') - mpirun --hostfile /tmp/hostfile \ - -x $ENV_VARS \ - -mca plm_rsh_no_tree_spawn 1 \ - --mca orte_keep_fqdn_hostnames 1 \ - --mca btl self,tcp \ - --mca btl_tcp_if_include eth0 \ - --bind-to none \ - --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p 222" \ - /third_party/nccl-tests/build/all_gather_perf -b 1K -e 8G -f 2 -g 1 -w 5 --iters 100 -c 1 + mpirun --hostfile /tmp/hostfile \ + -x $ENV_VARS \ + -mca plm_rsh_no_tree_spawn 1 \ + --mca orte_keep_fqdn_hostnames 1 \ + --mca btl self,tcp \ + --mca btl_tcp_if_include eth0 \ + --bind-to none \ + --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p 222" \ + /third_party/nccl-tests/build/all_gather_perf -b 1K -e 8G -f 2 -g 1 -w 5 --iters 100 -c 1 - else - while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do - sleep 5 - done - fi - - exit 0 - - volumeMounts: - - name: nvidia - mountPath: 
/usr/local/nvidia - - name: gib - mountPath: /usr/local/gib - - name: shared-memory - mountPath: /dev/shm - resources: - limits: - nvidia.com/gpu: 8 - requests: - nvidia.com/gpu: 8 - restartPolicy: Never + else + while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do + sleep 5 + done + fi + + exit 0 + + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 diff --git a/examples/gke-a3-ultragpu/nccl-test-32-node.yaml b/examples/gke-a3-ultragpu/nccl-test-32-node.yaml index 3ce2b490d6..3f51ecd239 100644 --- a/examples/gke-a3-ultragpu/nccl-test-32-node.yaml +++ b/examples/gke-a3-ultragpu/nccl-test-32-node.yaml @@ -1,3 +1,17 @@ +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: @@ -9,200 +23,199 @@ spec: network: enableDNSHostnames: true replicatedJobs: - - name: w - template: - spec: - parallelism: 32 - completions: 32 - - template: - metadata: - annotations: - networking.gke.io/default-interface: 'eth0' - networking.gke.io/interfaces: | - [ - {"interfaceName":"eth0","network":"default"}, - {"interfaceName":"eth1","network":"gke-a3-ultra-sub-1"}, - {"interfaceName":"eth2","network":"gke-a3-ultra-rdma-sub-0"}, - {"interfaceName":"eth3","network":"gke-a3-ultra-rdma-sub-1"}, - {"interfaceName":"eth4","network":"gke-a3-ultra-rdma-sub-2"}, - {"interfaceName":"eth5","network":"gke-a3-ultra-rdma-sub-3"}, - {"interfaceName":"eth6","network":"gke-a3-ultra-rdma-sub-4"}, - {"interfaceName":"eth7","network":"gke-a3-ultra-rdma-sub-5"}, - {"interfaceName":"eth8","network":"gke-a3-ultra-rdma-sub-6"}, - {"interfaceName":"eth9","network":"gke-a3-ultra-rdma-sub-7"} - ] - spec: - # Limit benchmark run duration - activeDeadlineSeconds: 3600 - restartPolicy: Never - nodeSelector: - cloud.google.com/gke-nodepool: a3-ultragpu-8g-a3-ultragpu-pool - tolerations: - - key: cloud.google.com/gke-queued - effect: NoSchedule - value: "true" - - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - setHostnameAsFQDN: true - volumes: - - name: gib - hostPath: - path: /home/kubernetes/bin/gib + - name: w + template: + spec: + parallelism: 32 + completions: 32 + + template: + metadata: + annotations: + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gke-a3-ultra-sub-1"}, + {"interfaceName":"eth2","network":"gke-a3-ultra-rdma-sub-0"}, + {"interfaceName":"eth3","network":"gke-a3-ultra-rdma-sub-1"}, + {"interfaceName":"eth4","network":"gke-a3-ultra-rdma-sub-2"}, + {"interfaceName":"eth5","network":"gke-a3-ultra-rdma-sub-3"}, + {"interfaceName":"eth6","network":"gke-a3-ultra-rdma-sub-4"}, + {"interfaceName":"eth7","network":"gke-a3-ultra-rdma-sub-5"}, + {"interfaceName":"eth8","network":"gke-a3-ultra-rdma-sub-6"}, + 
{"interfaceName":"eth9","network":"gke-a3-ultra-rdma-sub-7"} + ] + spec: + # Limit benchmark run duration + activeDeadlineSeconds: 3600 + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-nodepool: a3-ultragpu-8g-a3-ultragpu-pool + tolerations: + - key: cloud.google.com/gke-queued + effect: NoSchedule + value: "true" + + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + + setHostnameAsFQDN: true + volumes: + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: nvidia + hostPath: + path: /home/kubernetes/bin/nvidia + - name: lib64 + hostPath: + path: /lib64 + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + - name: sys + hostPath: + path: /sys + - name: proc-sys + hostPath: + path: /proc/sys + schedulingGates: + # Set this to a unique name per job. + - name: "gke.io/topology-aware-auto-ag-32" + + initContainers: + - name: gpu-healthcheck + image: alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + apk add --no-cache bash # Install bash + /bin/bash -c "set -ex + NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) + if [ \${NUM_GPUS} -lt 8 ]; then + echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" + exit 1 + fi + gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) + for gpu_index in \${!gpu_errors[@]}; do + if [ \${gpu_errors[\$gpu_index]} == '[N/A]' ]; then + echo 'Error: ERR detected in GPU index '\$gpu_index + exit 1 + elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then + echo 'Error: Unrecoverable ECC errors detected in GPU index '\$gpu_index + exit 1 + fi + done + echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" + + volumeMounts: - name: nvidia - hostPath: - path: /home/kubernetes/bin/nvidia + mountPath: /usr/local/nvidia - name: lib64 - hostPath: - path: /lib64 - - name: shared-memory - emptyDir: - medium: "Memory" - sizeLimit: 250Gi - - name: sys - hostPath: - path: /sys - - name: proc-sys - hostPath: - path: /proc/sys - schedulingGates: - # Set this to a unique name per job. 
- - name: "gke.io/topology-aware-auto-ag-32" - - initContainers: - - name: gpu-healthcheck - image: alpine:latest - command: ["/bin/sh", "-c"] - args: - - | - apk add --no-cache bash # Install bash - /bin/bash -c "set -ex - NUM_GPUS=$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | wc -l) - if [ \${NUM_GPUS} -lt 8 ]; then - echo \"Error: Only \${NUM_GPUS} GPUs and expected 8\" - exit 1 - fi - gpu_errors=(\$(/usr/local/nvidia/bin/nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader,nounits)) - for gpu_index in \${!gpu_errors[@]}; do - if [ \${gpu_errors[\$gpu_index]} == '[N/A]' ]; then - echo 'Error: ERR detected in GPU index '\$gpu_index - exit 1 - elif [ \${gpu_errors[\$gpu_index]} -gt 0 ]; then - echo 'Error: Unrecoverable ECC errors detected in GPU index '\$gpu_index - exit 1 - fi - done - echo \${NUM_GPUS} GPUs found with no ERR or Unrecoverable ECC errors" - - volumeMounts: - - name: nvidia - mountPath: /usr/local/nvidia - - name: lib64 - mountPath: /lib64 - securityContext: - privileged: true - env: - - name: LD_LIBRARY_PATH - value: /usr/local/nvidia/lib64 - - containers: - - name: nccl - stdin: true - tty: true - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 - securityContext: - privileged: true - env: - - name: MY_NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: OMPI_ALLOW_RUN_AS_ROOT - value: "1" - - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM - value: "1" - command: - - bash - - -c - - | - set -x - export N_NODES=32 - echo "Starting workload container on ${MY_NODE_NAME} for $N_NODES benchmark" - - # Load all the cuda libs - /sbin/ldconfig - - # Install ping - apt update -y - apt install -y iputils-ping - - # Start sshd - /scripts/container_entry.sh daemon & - - # Get helper variables to form all hostnames - export POSTFIX=$(hostname | cut -d . -f 2-) - export WORKERS_BASENAME=$(hostname | cut -d . -f 1 | rev | cut -d - -f 2- | rev ) - export NODE_RANK=$JOB_COMPLETION_INDEX - - - # For every worker, wait till online and add to hostfile - for i in `seq 0 $(($N_NODES-1))`; do - OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} - until ssh -p 222 -o StrictHostKeyChecking=no $OTHER hostname; do - echo Waiting for ${OTHER}... - sleep 10 - done - echo ${OTHER} port=222 slots=8 | tee -a /tmp/hostfile; + mountPath: /lib64 + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + + containers: + - name: nccl + stdin: true + tty: true + image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + securityContext: + privileged: true + env: + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: OMPI_ALLOW_RUN_AS_ROOT + value: "1" + - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM + value: "1" + command: + - bash + - -c + - | + set -x + export N_NODES=32 + echo "Starting workload container on ${MY_NODE_NAME} for $N_NODES benchmark" + + # Load all the cuda libs + /sbin/ldconfig + + # Install ping + apt update -y + apt install -y iputils-ping + + # Start sshd + /scripts/container_entry.sh daemon & + + # Get helper variables to form all hostnames + export POSTFIX=$(hostname | cut -d . -f 2-) + export WORKERS_BASENAME=$(hostname | cut -d . 
-f 1 | rev | cut -d - -f 2- | rev ) + export NODE_RANK=$JOB_COMPLETION_INDEX + + + # For every worker, wait till online and add to hostfile + for i in `seq 0 $(($N_NODES-1))`; do + OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX} + until ssh -p 222 -o StrictHostKeyChecking=no $OTHER hostname; do + echo Waiting for ${OTHER}... + sleep 10 done + echo ${OTHER} port=222 slots=8 | tee -a /tmp/hostfile; + done - cat /tmp/hostfile + cat /tmp/hostfile - # Launch from head node - if [[ "${NODE_RANK}" -eq "0" ]]; then + # Launch from head node + if [[ "${NODE_RANK}" -eq "0" ]]; then - # World Level = 0x0, Rail Aligned = 0x7 - export NCCL_TESTS_SPLIT_MASK="0x0"; + # World Level = 0x0, Rail Aligned = 0x7 + export NCCL_TESTS_SPLIT_MASK="0x0"; - # Force use of libnccl-gib - export NCCL_NET=gIB + # Force use of libnccl-gib + export NCCL_NET=gIB - # Set all the correct libnccl-gib environment variables - source /usr/local/gib/scripts/set_nccl_env.sh + # Set all the correct libnccl-gib environment variables + source /usr/local/gib/scripts/set_nccl_env.sh - # Get all relevant NCCL / env vars to pass to all workers - ENV_VARS=$(echo ${!NCCL*} ${!OMPI*} LD_LIBRARY_PATH PATH | sed 's/ / -x /g') + # Get all relevant NCCL / env vars to pass to all workers + ENV_VARS=$(echo ${!NCCL*} ${!OMPI*} LD_LIBRARY_PATH PATH | sed 's/ / -x /g') - mpirun --hostfile /tmp/hostfile \ - -x $ENV_VARS \ - -mca plm_rsh_no_tree_spawn 1 \ - --mca orte_keep_fqdn_hostnames 1 \ - --mca btl self,tcp \ - --mca btl_tcp_if_include eth0 \ - --bind-to none \ - --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p 222" \ - /third_party/nccl-tests/build/all_gather_perf -b 1K -e 8G -f 2 -g 1 -w 5 --iters 100 -c 1 + mpirun --hostfile /tmp/hostfile \ + -x $ENV_VARS \ + -mca plm_rsh_no_tree_spawn 1 \ + --mca orte_keep_fqdn_hostnames 1 \ + --mca btl self,tcp \ + --mca btl_tcp_if_include eth0 \ + --bind-to none \ + --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p 222" \ + /third_party/nccl-tests/build/all_gather_perf -b 1K -e 8G -f 2 -g 1 -w 5 --iters 100 -c 1 - else - while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do - sleep 5 - done - fi - - exit 0 - - volumeMounts: - - name: nvidia - mountPath: /usr/local/nvidia - - name: gib - mountPath: /usr/local/gib - - name: shared-memory - mountPath: /dev/shm - resources: - limits: - nvidia.com/gpu: 8 - requests: - nvidia.com/gpu: 8 - restartPolicy: Never + else + while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do + sleep 5 + done + fi + + exit 0 + + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + - name: shared-memory + mountPath: /dev/shm + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 From e3a374aef529d17659a1e00e826c389fcb4f9be4 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:23:28 +0000 Subject: [PATCH 047/140] fix more linter errors --- examples/gke-a3-ultragpu/nccl-test.yaml | 122 +++++++++++------------- 1 file changed, 58 insertions(+), 64 deletions(-) diff --git a/examples/gke-a3-ultragpu/nccl-test.yaml b/examples/gke-a3-ultragpu/nccl-test.yaml index 994601472f..9b4fd881b7 100644 --- a/examples/gke-a3-ultragpu/nccl-test.yaml +++ b/examples/gke-a3-ultragpu/nccl-test.yaml @@ -53,41 +53,38 @@ metadata: ] spec: volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: shared-memory + emptyDir: + medium: "Memory" 
+ sizeLimit: 250Gi + containers: + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + name: test + volumeMounts: - name: library-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia + mountPath: /usr/local/nvidia - name: gib - hostPath: - path: /home/kubernetes/bin/gib + mountPath: /usr/local/gib - name: shared-memory - emptyDir: - medium: "Memory" - sizeLimit: 250Gi - containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 - name: test - resources: - requests: - cpu: 150m - volumeMounts: - - name: library-dir-host - mountPath: /usr/local/nvidia - - name: gib - mountPath: /usr/local/gib - - name: shared-memory - mountPath: /dev/shm - env: - - name: LD_LIBRARY_PATH - value: /usr/local/nvidia/lib64 - resources: - limits: - nvidia.com/gpu: 8 - command: ["/bin/bash", "-c"] - args: - - | - /scripts/container_entry.sh shell - source /usr/local/gib/scripts/set_nccl_env.sh - sleep infinity + mountPath: /dev/shm + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + resources: + limits: + nvidia.com/gpu: 8 + command: ["/bin/bash", "-c"] + args: + - | + /scripts/container_entry.sh shell + source /usr/local/gib/scripts/set_nccl_env.sh + sleep infinity --- apiVersion: v1 kind: Pod @@ -112,38 +109,35 @@ metadata: ] spec: volumes: + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + containers: + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 + name: test + volumeMounts: - name: library-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia + mountPath: /usr/local/nvidia - name: gib - hostPath: - path: /home/kubernetes/bin/gib + mountPath: /usr/local/gib - name: shared-memory - emptyDir: - medium: "Memory" - sizeLimit: 250Gi - containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2 - name: test - resources: - requests: - cpu: 150m - volumeMounts: - - name: library-dir-host - mountPath: /usr/local/nvidia - - name: gib - mountPath: /usr/local/gib - - name: shared-memory - mountPath: /dev/shm - env: - - name: LD_LIBRARY_PATH - value: /usr/local/nvidia/lib64 - resources: - limits: - nvidia.com/gpu: 8 - command: ["/bin/bash", "-c"] - args: - - | - /scripts/container_entry.sh shell - source /usr/local/gib/scripts/set_nccl_env.sh - sleep infinity + mountPath: /dev/shm + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + resources: + limits: + nvidia.com/gpu: 8 + command: ["/bin/bash", "-c"] + args: + - | + /scripts/container_entry.sh shell + source /usr/local/gib/scripts/set_nccl_env.sh + sleep infinity From cb8d7d0a85dab53e264a8fcb5b66738b86ff1647 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 16 Dec 2024 05:10:51 +0000 Subject: [PATCH 048/140] Use dataclass to represent machine type --- .../modules/slurm_files/scripts/conf.py | 13 +- .../slurm_files/scripts/tests/common.py | 3 +- .../slurm_files/scripts/tests/test_conf.py | 4 +- .../slurm_files/scripts/tests/test_util.py | 54 +++++- .../modules/slurm_files/scripts/util.py | 164 ++++++++++-------- 5 files changed, 154 insertions(+), 84 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index a4ff1e488a..dd3d628cbb 100755 --- 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -72,7 +72,7 @@ def get(key, default): no_comma_params = get("no_comma_params", False) any_gpus = any( - lkp.template_info(nodeset.instance_template).gpu_count > 0 + lkp.template_info(nodeset.instance_template).gpu for nodeset in lkp.cfg.nodeset.values() ) @@ -136,7 +136,7 @@ def nodeset_lines(nodeset, lkp: util.Lookup) -> str: # follow https://slurm.schedmd.com/slurm.conf.html#OPT_Boards # by setting Boards, SocketsPerBoard, CoresPerSocket, and ThreadsPerCore - gres = f"gpu:{template_info.gpu_count}" if template_info.gpu_count else None + gres = f"gpu:{template_info.gpu.count}" if template_info.gpu else None node_conf = { "RealMemory": machine_conf.memory, "Boards": machine_conf.boards, @@ -360,11 +360,10 @@ def gen_cloud_gres_conf(lkp: util.Lookup) -> None: gpu_nodes = defaultdict(list) for nodeset in lkp.cfg.nodeset.values(): - template_info = lkp.template_info(nodeset.instance_template) - gpu_count = template_info.gpu_count - if gpu_count == 0: - continue - gpu_nodes[gpu_count].append(lkp.nodelist(nodeset)) + ti = lkp.template_info(nodeset.instance_template) + gpu_count = ti.gpu.count if ti.gpu else 0 + if gpu_count: + gpu_nodes[gpu_count].append(lkp.nodelist(nodeset)) lines = [ dict_to_conf( diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py index 54d7f45d43..643712efa7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py @@ -20,6 +20,7 @@ if SCRIPTS_DIR not in sys.path: sys.path.append(SCRIPTS_DIR) # TODO: make this more robust +import util # TODO: use "real" classes once they are defined (instead of NSDict) @@ -79,7 +80,7 @@ class TstMachineConf: @dataclass class TstTemplateInfo: - gpu_count: int = 0 + gpu: Optional[util.AcceleratorInfo] @dataclass class TstInstance: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py index 6585b2fcd1..a8ea8c1c13 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py @@ -44,7 +44,9 @@ def test_nodeset_lines(): node_conf={"red": "velvet", "CPUs": 55}, ) lkp = util.Lookup(TstCfg()) - lkp.template_info = Mock(return_value=TstTemplateInfo(gpu_count=33)) + lkp.template_info = Mock(return_value=TstTemplateInfo( + gpu=util.AcceleratorInfo(type="Popov", count=33) + )) mc = TstMachineConf( cpus=5, memory=6, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index b6e73526f9..40e2cd947d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -18,7 +18,7 @@ from mock import Mock from common import TstNodeset, TstCfg # needed to import util import util -from util import NodeState +from util import NodeState, MachineType, AcceleratorInfo from datetime import timedelta from google.api_core.client_options import ClientOptions # noqa: E402 @@ -348,3 +348,55 @@ def test_node_state(node: str, state: Optional[NodeState], want: NodeState | Non else: assert lkp.node_state(node) == want + + +@pytest.mark.parametrize( + "jo,want", + [ + ({ + "accelerators": [ { "guestAcceleratorCount": 1, "guestAcceleratorType": "nvidia-tesla-a100" } ], + "creationTimestamp": "1969-12-31T16:00:00.000-08:00", + "description": "Accelerator Optimized: 1 NVIDIA Tesla A100 GPU, 12 vCPUs, 85GB RAM", + "guestCpus": 12, + "id": "1000012", + "imageSpaceGb": 0, + "isSharedCpu": False, + "kind": "compute#machineType", + "maximumPersistentDisks": 128, + "maximumPersistentDisksSizeGb": "263168", + "memoryMb": 87040, + "name": "a2-highgpu-1g", + "selfLink": "https://www.googleapis.com/compute/v1/projects/io-playground/zones/us-central1-a/machineTypes/a2-highgpu-1g", + "zone": "us-central1-a" + }, MachineType( + name="a2-highgpu-1g", + guest_cpus=12, + memory_mb=87040, + accelerators=[ + AcceleratorInfo(type="nvidia-tesla-a100", count=1) + ] + )), + ({ + "architecture": "X86_64", + "creationTimestamp": "1969-12-31T16:00:00.000-08:00", + "description": "8 vCPUs, 32 GB RAM", + "guestCpus": 8, + "id": "1210008", + "imageSpaceGb": 0, + "isSharedCpu": False, + "kind": "compute#machineType", + "maximumPersistentDisks": 128, + "maximumPersistentDisksSizeGb": "263168", + "memoryMb": 32768, + "name": "t2d-standard-8", + "selfLink": "https://www.googleapis.com/compute/v1/projects/io-playground/zones/europe-north2-b/machineTypes/t2d-standard-8", + "zone": "europe-north2-b" + }, MachineType( + name="t2d-standard-8", + guest_cpus=8, + memory_mb=32768, + accelerators=[] + )), + ]) +def test_MachineType_from_json(jo: dict, want: MachineType): + assert MachineType.from_json(jo) == want diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 62fafde6e1..96955309d3 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -129,6 +129,65 @@ class ApiEndpoint(Enum): SECRET = "secret_manager" +@dataclass(frozen=True) +class AcceleratorInfo: + type: str + count: int + + @classmethod + def from_json(cls, jo: dict) -> "AcceleratorInfo": + return cls( + type=jo["guestAcceleratorType"], + count=jo["guestAcceleratorCount"]) + +@dataclass(frozen=True) +class MachineType: + name: str + guest_cpus: int + memory_mb: int + accelerators: List[AcceleratorInfo] + + @classmethod + def from_json(cls, jo: dict) -> "MachineType": + return cls( + name=jo["name"], + guest_cpus=jo["guestCpus"], + memory_mb=jo["memoryMb"], + accelerators=[ + AcceleratorInfo.from_json(a) for a in jo.get("accelerators", [])], + ) + + @property + def family(self) -> str: + # TODO: doesn't work with N1 custom machine types + # See https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type#create + return self.name.split("-")[0] + + @property + def supports_smt(self) -> bool: + # 
https://cloud.google.com/compute/docs/cpu-platforms + if self.family in ("t2a", "t2d", "h3", "c4a",): + return False + if self.guest_cpus == 1: + return False + return True + + @property + def sockets(self) -> int: + return { + "h3": 2, + "c2d": 2 if self.guest_cpus > 56 else 1, + "a3": 2, + "c2": 2 if self.guest_cpus > 30 else 1, + "c3": 2 if self.guest_cpus > 88 else 1, + "c3d": 2 if self.guest_cpus > 180 else 1, + "c4": 2 if self.guest_cpus > 96 else 1, + }.get( + self.family, 1, # assume 1 socket for all other families + ) + + + @lru_cache(maxsize=1) def default_credentials(): return google.auth.default()[0] @@ -1111,46 +1170,8 @@ def get_insert_operations(group_ids): return get_filtered_operations(" AND ".join(f"({f})" for f in filters if f)) -def machine_type_family(mt: str) -> str: - """get machine type family from machine type""" - # TODO: doesn't work with N1 custom machine types - # See https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type#create - return mt.split("-")[0] - - -def machine_type_sockets(template) -> int: - guestCpus: int = int(template.machine_info.guestCpus) - return { - "h3": 2, - "c2d": 2 if guestCpus > 56 else 1, - "a3": 2, - "c2": 2 if guestCpus > 30 else 1, - "c3": 2 if guestCpus > 88 else 1, - "c3d": 2 if guestCpus > 180 else 1, - "c4": 2 if guestCpus > 96 else 1, - }.get( - machine_type_family(template.machineType), - 1, # assume 1 socket for all other families - ) - - -def isSmt(template) -> bool: - # https://cloud.google.com/compute/docs/cpu-platforms - noSmtFamily = ( - "t2a", - "t2d", - "h3", - "c4a", - ) - if machine_type_family(template.machineType) in noSmtFamily: - return False - if template.machine_info.guestCpus == 1: - return False - return True - - def getThreadsPerCore(template) -> int: - if not isSmt(template): + if not template.machine_type.supports_smt: return 1 return template.advancedMachineFeatures.threadsPerCore or 2 @@ -1650,53 +1671,48 @@ def machine_types(self): op = act.aggregatedList_next(op, result) return machines - def machine_type(self, machine_type: str): - """ """ + def machine_type(self, name: str) -> MachineType: custom_patt = re.compile( r"((?P\w+)-)?custom-(?P\d+)-(?P\d+)" ) - custom_match = custom_patt.match(machine_type) - if custom_match is not None: - groups = custom_match.groupdict() - cpus, mem = (groups[k] for k in ["cpus", "mem"]) - machine_info = { - "guestCpus": int(cpus), - "memoryMb": int(mem), - } - else: - machines = self.machine_types() - if machine_type not in machines: - raise Exception(f"machine type {machine_type} not found") - per_zone = machines[machine_type] - assert per_zone - machine_info = next(iter(per_zone.values())) # pick the first/any zone - return NSDict(machine_info) + if match := custom_patt.match(name): + return MachineType( + name=name, + guest_cpus=int(match.group("cpus")), + memory_mb=int(match.group("mem")), + accelerators=[], + ) + + machines = self.machine_types() + if name not in machines: + raise Exception(f"machine type {name} not found") + per_zone = machines[name] + assert per_zone + return MachineType.from_json( + next(iter(per_zone.values())) # pick the first/any zone + ) def template_machine_conf(self, template_link): template = self.template_info(template_link) - if not template.machineType: - temp_name = trim_self_link(template_link) - raise Exception(f"instance template {temp_name} has no machine type") - template.machine_info = self.machine_type(template.machineType) - machine = template.machine_info + machine = template.machine_type 
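For quick reference, the behaviour encoded by the new dataclasses can be summarised in a standalone sketch (mirroring the `a2-highgpu-1g` and `t2d-standard-8` fixtures from `test_util.py` above; the authoritative definitions are the ones in `util.py`, so treat this only as an illustration):

```python
from dataclasses import dataclass
from typing import List


@dataclass(frozen=True)
class AcceleratorInfo:
    type: str
    count: int


@dataclass(frozen=True)
class MachineType:
    name: str
    guest_cpus: int
    memory_mb: int
    accelerators: List[AcceleratorInfo]

    @classmethod
    def from_json(cls, jo: dict) -> "MachineType":
        # Field names follow the compute#machineType JSON shape used in the tests above.
        return cls(
            name=jo["name"],
            guest_cpus=jo["guestCpus"],
            memory_mb=jo["memoryMb"],
            accelerators=[
                AcceleratorInfo(a["guestAcceleratorType"], a["guestAcceleratorCount"])
                for a in jo.get("accelerators", [])
            ],
        )

    @property
    def family(self) -> str:
        # e.g. "a2-highgpu-1g" -> "a2"
        return self.name.split("-")[0]


a2 = MachineType.from_json({
    "name": "a2-highgpu-1g",
    "guestCpus": 12,
    "memoryMb": 87040,
    "accelerators": [
        {"guestAcceleratorType": "nvidia-tesla-a100", "guestAcceleratorCount": 1}
    ],
})
t2d = MachineType.from_json(
    {"name": "t2d-standard-8", "guestCpus": 8, "memoryMb": 32768}
)

assert a2.family == "a2" and a2.accelerators == [AcceleratorInfo("nvidia-tesla-a100", 1)]
assert t2d.family == "t2d" and t2d.accelerators == []
```

The `family` prefix is what `supports_smt` and `sockets` key off, which is why custom N1 machine types remain a known gap (see the TODO comment in the hunk above).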
machine_conf = NSDict() machine_conf.boards = 1 # No information, assume 1 - machine_conf.sockets = machine_type_sockets(template) + machine_conf.sockets = machine.sockets # the value below for SocketsPerBoard must be type int machine_conf.sockets_per_board = machine_conf.sockets // machine_conf.boards machine_conf.threads_per_core = 1 _div = 2 if getThreadsPerCore(template) == 1 else 1 machine_conf.cpus = ( - int(machine.guestCpus / _div) if isSmt(template) else machine.guestCpus + int(machine.guest_cpus / _div) if machine.supports_smt else machine.guest_cpus ) machine_conf.cores_per_socket = int(machine_conf.cpus / machine_conf.sockets) # Because the actual memory on the host will be different than # what is configured (e.g. kernel will take it). From # experiments, about 16 MB per GB are used (plus about 400 MB # buffer for the first couple of GB's. Using 30 MB to be safe. - gb = machine.memoryMb // 1024 - machine_conf.memory = machine.memoryMb - (400 + (30 * gb)) + gb = machine.memory_mb // 1024 + machine_conf.memory = machine.memory_mb - (400 + (30 * gb)) return machine_conf @contextmanager @@ -1741,20 +1757,20 @@ def template_info(self, template_link): # name and link are not in properties, so stick them in template.name = template_name template.link = template_link + template.machine_type = self.machine_type(template.machineType) # TODO delete metadata to reduce memory footprint? # del template.metadata # translate gpus into an easier-to-read format - machine_info = self.machine_type(template.machineType) - if machine_info.accelerators: - template.gpu_type = machine_info.accelerators[0].guestAcceleratorType - template.gpu_count = machine_info.accelerators[0].guestAcceleratorCount + if template.machine_type.accelerators: + template.gpu = template.machine_type.accelerators[0] elif template.guestAccelerators: - template.gpu_type = template.guestAccelerators[0].acceleratorType - template.gpu_count = template.guestAccelerators[0].acceleratorCount + tga = template.guestAccelerators[0] + template.gpu = AcceleratorInfo( + type=tga.acceleratorType, + count=tga.acceleratorCount) else: - template.gpu_type = None - template.gpu_count = 0 + template.gpu = None # keep write access open for minimum time with self.template_cache(writeback=True) as cache: From 842f48380510c3b0c2b00484448d963579e10c18 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Thu, 19 Dec 2024 00:58:05 +0000 Subject: [PATCH 049/140] Add terraform setup to github workflow config --- .github/workflows/pr-precommit.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/pr-precommit.yml b/.github/workflows/pr-precommit.yml index 5b1b5091cf..272fb524f4 100644 --- a/.github/workflows/pr-precommit.yml +++ b/.github/workflows/pr-precommit.yml @@ -41,6 +41,10 @@ jobs: with: go-version: '1.22' check-latest: true + - uses: hashicorp/setup-terraform@v3 + with: + terraform_version: "1.5.7" + terraform_wrapper: false - run: make install-dev-deps - uses: terraform-linters/setup-tflint@v4 with: From 8e54adc3cf1946dc4e2e89cac009e4883fb023d1 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Wed, 18 Dec 2024 21:54:22 +0000 Subject: [PATCH 050/140] Add validation to prevent creation of empty nodesets --- .../compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf index ad78840a38..5781d2415c 100644 --- 
a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf @@ -75,7 +75,14 @@ output "nodeset" { precondition { condition = var.future_reservation == "" || local.fr_zone == var.zone error_message = <<-EOD - The zone of the deployment must match that of the future reservation" + The zone of the deployment must match that of the future reservation + EOD + } + + precondition { + condition = var.node_count_dynamic_max > 0 || var.node_count_static > 0 + error_message = <<-EOD + This nodeset contains zero nodes, there should be at least one static or dynamic node EOD } } From b4ac5130739300f8a8a78d50dcfe537d06af5307 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Thu, 19 Dec 2024 06:30:13 +0000 Subject: [PATCH 051/140] Fix gke parallelstore blueprint name going beyond network char limit --- examples/README.md | 4 ++-- ...-parallelstore.yaml => gke-managed-parallelstore.yaml} | 2 +- modules/file-system/gke-storage/README.md | 2 +- ...-parallelstore.yaml => gke-managed-parallelstore.yaml} | 6 +++--- ...ed-parallelstore.yml => gke-managed-parallelstore.yml} | 8 ++++---- 5 files changed, 11 insertions(+), 11 deletions(-) rename examples/{gke-storage-managed-parallelstore.yaml => gke-managed-parallelstore.yaml} (98%) rename tools/cloud-build/daily-tests/builds/{gke-storage-managed-parallelstore.yaml => gke-managed-parallelstore.yaml} (90%) rename tools/cloud-build/daily-tests/tests/{gke-storage-managed-parallelstore.yml => gke-managed-parallelstore.yml} (77%) diff --git a/examples/README.md b/examples/README.md index 46ab3d11c0..29db27df94 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1518,7 +1518,7 @@ cleaned up when the job is deleted. [storage-gke.yaml]: ../examples/storage-gke.yaml -### [gke-storage-managed-parallelstore.yaml] ![core-badge] ![experimental-badge] +### [gke-managed-parallelstore.yaml] ![core-badge] ![experimental-badge] This blueprint shows how to use managed parallelstore storage options with GKE in the toolkit. @@ -1540,7 +1540,7 @@ The blueprint contains the following: > `--vars authorized_cidr=/32`.** You can use a service like > [whatismyip.com](https://whatismyip.com) to determine your IP address. -[gke-storage-managed-parallelstore.yaml]: ../examples/gke-storage-managed-parallelstore.yaml +[gke-managed-parallelstore.yaml]: ../examples/gke-managed-parallelstore.yaml ### [gke-a3-megagpu.yaml] ![core-badge] ![experimental-badge] diff --git a/examples/gke-storage-managed-parallelstore.yaml b/examples/gke-managed-parallelstore.yaml similarity index 98% rename from examples/gke-storage-managed-parallelstore.yaml rename to examples/gke-managed-parallelstore.yaml index 414a2b180d..4425f13181 100644 --- a/examples/gke-storage-managed-parallelstore.yaml +++ b/examples/gke-managed-parallelstore.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -blueprint_name: gke-storage-managed-parallelstore +blueprint_name: gke-managed-parallelstore vars: project_id: ## Set GCP Project ID Here ## deployment_name: gke-storage-managed-ps diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index f4ebd8add0..fc65e76d4d 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -39,7 +39,7 @@ then use them in a `gke-job-template` to dynamically provision the resource. 
``` See example -[gke-storage-managed-parallelstore.yaml](../../../examples/README.md#gke-storage-managed-parallelstoreyaml--) blueprint +[gke-managed-parallelstore.yaml](../../../examples/README.md#gke-managed-parallelstoreyaml--) blueprint for a complete example. ### Authorized Network diff --git a/tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml b/tools/cloud-build/daily-tests/builds/gke-managed-parallelstore.yaml similarity index 90% rename from tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml rename to tools/cloud-build/daily-tests/builds/gke-managed-parallelstore.yaml index 8fbc9c1794..01010a0435 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-managed-parallelstore.yaml @@ -27,7 +27,7 @@ timeout: 14400s # 4hr steps: ## Test GKE -- id: gke-storage-managed-parallelstore +- id: gke-managed-parallelstore name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: @@ -40,7 +40,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=examples/gke-storage-managed-parallelstore.yaml + SG_EXAMPLE=examples/gke-managed-parallelstore.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} @@ -58,4 +58,4 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml" + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-managed-parallelstore.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml b/tools/cloud-build/daily-tests/tests/gke-managed-parallelstore.yml similarity index 77% rename from tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml rename to tools/cloud-build/daily-tests/tests/gke-managed-parallelstore.yml index bfb8bc32d7..cd9e7f712b 100644 --- a/tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml +++ b/tools/cloud-build/daily-tests/tests/gke-managed-parallelstore.yml @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -test_name: gke-storage-managed-parallelstore -deployment_name: gke-storage-managed-parallelstore-{{ build }} +test_name: gke-managed-parallelstore +deployment_name: gke-managed-parallelstore-{{ build }} zone: us-central1-a # for remote node region: us-central1 workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/gke-storage-managed-parallelstore.yaml" +blueprint_yaml: "{{ workspace }}/examples/gke-managed-parallelstore.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: -- test-validation/test-gke-storage-managed-parallelstore.yml +- test-validation/test-gke-managed-parallelstore.yml custom_vars: project: "{{ project }}" cli_deployment_vars: From c5cf2159d808d3948c200d2b48d98a5a4561138a Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Thu, 19 Dec 2024 07:15:43 +0000 Subject: [PATCH 052/140] Updated ansible playbook test file name --- ...orage-parallelstore.yml => test-gke-managed-parallelstore.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tools/cloud-build/daily-tests/ansible_playbooks/test-validation/{test-gke-storage-parallelstore.yml => test-gke-managed-parallelstore.yml} (100%) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-parallelstore.yml similarity index 100% rename from tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml rename to tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-parallelstore.yml From deea05eb38a58e255720ce6d7f12c5e81cde76b3 Mon Sep 17 00:00:00 2001 From: ighosh98 Date: Thu, 19 Dec 2024 09:10:06 +0000 Subject: [PATCH 053/140] upgrade a3-ultra to use kueue v0.10.0 --- examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml index 72bf5e2bf2..2eb10b679c 100644 --- a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml @@ -176,7 +176,7 @@ deployment_groups: settings: kueue: install: true - version: v0.9.1 + version: v0.10.0 jobset: install: true version: v0.7.1 From 862e19b85bfa3c495e7db7f35d6d60f6245faa06 Mon Sep 17 00:00:00 2001 From: Parul Bajaj Date: Thu, 19 Dec 2024 12:49:12 +0000 Subject: [PATCH 054/140] Add compact placement validations --- modules/compute/gke-node-pool/main.tf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index f1999cbd0b..9a09712097 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -307,6 +307,14 @@ resource "google_container_node_pool" "node_pool" { condition = local.upgrade_settings.max_unavailable > 0 || local.upgrade_settings.max_surge > 0 error_message = "At least one of max_unavailable or max_surge must greater than 0" } + precondition { + condition = var.placement_policy.type != "COMPACT" || length(var.zones) == 1 + error_message = "Compact placement is only available for node pools operating in a single zone." + } + precondition { + condition = var.placement_policy.type != "COMPACT" || local.upgrade_settings.strategy != "BLUE_GREEN" + error_message = "Compact placement is not supported with blue-green upgrades." 
+ } } } From 9658bbc735a7dd87afa02935049b1eaae6a9de46 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 19 Dec 2024 20:00:56 +0000 Subject: [PATCH 055/140] Rename `/community/module/internal/slurm-gcp-v6` to `slurm-gcp` No other changes --- .../schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf | 2 +- .../{slurm-gcp-v6 => slurm-gcp}/instance/README.md | 0 .../{slurm-gcp-v6 => slurm-gcp}/instance/main.tf | 0 .../{slurm-gcp-v6 => slurm-gcp}/instance/outputs.tf | 0 .../{slurm-gcp-v6 => slurm-gcp}/instance/variables.tf | 0 .../{slurm-gcp-v6 => slurm-gcp}/instance/versions.tf | 0 .../instance_template/README.md | 0 .../instance_template/files/startup_sh_unlinted | 0 .../instance_template/main.tf | 0 .../instance_template/outputs.tf | 0 .../instance_template/variables.tf | 0 .../instance_template/versions.tf | 0 .../internal_instance_template/README.md | 0 .../internal_instance_template/main.tf | 0 .../internal_instance_template/outputs.tf | 0 .../internal_instance_template/variables.tf | 0 .../internal_instance_template/versions.tf | 0 .../{slurm-gcp-v6 => slurm-gcp}/nodeset_tpu/README.md | 0 .../{slurm-gcp-v6 => slurm-gcp}/nodeset_tpu/main.tf | 0 .../{slurm-gcp-v6 => slurm-gcp}/nodeset_tpu/outputs.tf | 0 .../nodeset_tpu/variables.tf | 0 .../nodeset_tpu/versions.tf | 0 .../schedmd-slurm-gcp-v6-controller/README.md | 10 +++++----- .../schedmd-slurm-gcp-v6-controller/controller.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../modules/slurm_files/scripts/slurmsync.py | 1 + .../schedmd-slurm-gcp-v6-controller/partition.tf | 4 ++-- 28 files changed, 13 insertions(+), 12 deletions(-) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance/README.md (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance/main.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance/outputs.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance/variables.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance/versions.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance_template/README.md (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance_template/files/startup_sh_unlinted (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance_template/main.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance_template/outputs.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance_template/variables.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/instance_template/versions.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/internal_instance_template/README.md (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/internal_instance_template/main.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/internal_instance_template/outputs.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/internal_instance_template/variables.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/internal_instance_template/versions.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/nodeset_tpu/README.md (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/nodeset_tpu/main.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/nodeset_tpu/outputs.tf (100%) rename 
community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/nodeset_tpu/variables.tf (100%) rename community/modules/internal/{slurm-gcp-v6 => slurm-gcp}/nodeset_tpu/versions.tf (100%) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 643ef9ad84..50f0cbc6e0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | ../../internal/slurm-gcp-v6/instance_template | n/a | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | ../../internal/slurm-gcp/instance_template | n/a | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 6dcc872cab..a528978760 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -56,7 +56,7 @@ locals { } module "slurm_nodeset_template" { - source = "../../internal/slurm-gcp-v6/instance_template" + source = "../../internal/slurm-gcp/instance_template" project_id = var.project_id region = var.region diff --git a/community/modules/internal/slurm-gcp-v6/instance/README.md b/community/modules/internal/slurm-gcp/instance/README.md similarity index 100% rename from community/modules/internal/slurm-gcp-v6/instance/README.md rename to community/modules/internal/slurm-gcp/instance/README.md diff --git a/community/modules/internal/slurm-gcp-v6/instance/main.tf b/community/modules/internal/slurm-gcp/instance/main.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/instance/main.tf rename to community/modules/internal/slurm-gcp/instance/main.tf diff --git a/community/modules/internal/slurm-gcp-v6/instance/outputs.tf b/community/modules/internal/slurm-gcp/instance/outputs.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/instance/outputs.tf rename to community/modules/internal/slurm-gcp/instance/outputs.tf diff --git a/community/modules/internal/slurm-gcp-v6/instance/variables.tf b/community/modules/internal/slurm-gcp/instance/variables.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/instance/variables.tf rename to community/modules/internal/slurm-gcp/instance/variables.tf diff --git a/community/modules/internal/slurm-gcp-v6/instance/versions.tf b/community/modules/internal/slurm-gcp/instance/versions.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/instance/versions.tf rename to community/modules/internal/slurm-gcp/instance/versions.tf diff --git a/community/modules/internal/slurm-gcp-v6/instance_template/README.md b/community/modules/internal/slurm-gcp/instance_template/README.md similarity index 100% rename from community/modules/internal/slurm-gcp-v6/instance_template/README.md rename to community/modules/internal/slurm-gcp/instance_template/README.md diff --git a/community/modules/internal/slurm-gcp-v6/instance_template/files/startup_sh_unlinted b/community/modules/internal/slurm-gcp/instance_template/files/startup_sh_unlinted similarity index 100% rename from 
community/modules/internal/slurm-gcp-v6/instance_template/files/startup_sh_unlinted rename to community/modules/internal/slurm-gcp/instance_template/files/startup_sh_unlinted diff --git a/community/modules/internal/slurm-gcp-v6/instance_template/main.tf b/community/modules/internal/slurm-gcp/instance_template/main.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/instance_template/main.tf rename to community/modules/internal/slurm-gcp/instance_template/main.tf diff --git a/community/modules/internal/slurm-gcp-v6/instance_template/outputs.tf b/community/modules/internal/slurm-gcp/instance_template/outputs.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/instance_template/outputs.tf rename to community/modules/internal/slurm-gcp/instance_template/outputs.tf diff --git a/community/modules/internal/slurm-gcp-v6/instance_template/variables.tf b/community/modules/internal/slurm-gcp/instance_template/variables.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/instance_template/variables.tf rename to community/modules/internal/slurm-gcp/instance_template/variables.tf diff --git a/community/modules/internal/slurm-gcp-v6/instance_template/versions.tf b/community/modules/internal/slurm-gcp/instance_template/versions.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/instance_template/versions.tf rename to community/modules/internal/slurm-gcp/instance_template/versions.tf diff --git a/community/modules/internal/slurm-gcp-v6/internal_instance_template/README.md b/community/modules/internal/slurm-gcp/internal_instance_template/README.md similarity index 100% rename from community/modules/internal/slurm-gcp-v6/internal_instance_template/README.md rename to community/modules/internal/slurm-gcp/internal_instance_template/README.md diff --git a/community/modules/internal/slurm-gcp-v6/internal_instance_template/main.tf b/community/modules/internal/slurm-gcp/internal_instance_template/main.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/internal_instance_template/main.tf rename to community/modules/internal/slurm-gcp/internal_instance_template/main.tf diff --git a/community/modules/internal/slurm-gcp-v6/internal_instance_template/outputs.tf b/community/modules/internal/slurm-gcp/internal_instance_template/outputs.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/internal_instance_template/outputs.tf rename to community/modules/internal/slurm-gcp/internal_instance_template/outputs.tf diff --git a/community/modules/internal/slurm-gcp-v6/internal_instance_template/variables.tf b/community/modules/internal/slurm-gcp/internal_instance_template/variables.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/internal_instance_template/variables.tf rename to community/modules/internal/slurm-gcp/internal_instance_template/variables.tf diff --git a/community/modules/internal/slurm-gcp-v6/internal_instance_template/versions.tf b/community/modules/internal/slurm-gcp/internal_instance_template/versions.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/internal_instance_template/versions.tf rename to community/modules/internal/slurm-gcp/internal_instance_template/versions.tf diff --git a/community/modules/internal/slurm-gcp-v6/nodeset_tpu/README.md b/community/modules/internal/slurm-gcp/nodeset_tpu/README.md similarity index 100% rename from community/modules/internal/slurm-gcp-v6/nodeset_tpu/README.md rename to 
community/modules/internal/slurm-gcp/nodeset_tpu/README.md diff --git a/community/modules/internal/slurm-gcp-v6/nodeset_tpu/main.tf b/community/modules/internal/slurm-gcp/nodeset_tpu/main.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/nodeset_tpu/main.tf rename to community/modules/internal/slurm-gcp/nodeset_tpu/main.tf diff --git a/community/modules/internal/slurm-gcp-v6/nodeset_tpu/outputs.tf b/community/modules/internal/slurm-gcp/nodeset_tpu/outputs.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/nodeset_tpu/outputs.tf rename to community/modules/internal/slurm-gcp/nodeset_tpu/outputs.tf diff --git a/community/modules/internal/slurm-gcp-v6/nodeset_tpu/variables.tf b/community/modules/internal/slurm-gcp/nodeset_tpu/variables.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/nodeset_tpu/variables.tf rename to community/modules/internal/slurm-gcp/nodeset_tpu/variables.tf diff --git a/community/modules/internal/slurm-gcp-v6/nodeset_tpu/versions.tf b/community/modules/internal/slurm-gcp/nodeset_tpu/versions.tf similarity index 100% rename from community/modules/internal/slurm-gcp-v6/nodeset_tpu/versions.tf rename to community/modules/internal/slurm-gcp/nodeset_tpu/versions.tf diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 583d41825e..b03fbf0973 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -261,12 +261,12 @@ limitations under the License. | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | ../../internal/slurm-gcp-v6/instance_template | n/a | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | ../../internal/slurm-gcp/instance_template | n/a | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | ../../internal/slurm-gcp-v6/instance | n/a | -| [slurm\_login\_template](#module\_slurm\_login\_template) | ../../internal/slurm-gcp-v6/instance_template | n/a | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | ../../internal/slurm-gcp-v6/instance_template | n/a | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | ../../internal/slurm-gcp-v6/nodeset_tpu | n/a | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | ../../internal/slurm-gcp/instance | n/a | +| [slurm\_login\_template](#module\_slurm\_login\_template) | ../../internal/slurm-gcp/instance_template | n/a | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | ../../internal/slurm-gcp/instance_template | n/a | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | ../../internal/slurm-gcp/nodeset_tpu | n/a | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index fa28b8728f..c98813a722 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -43,7 +43,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "../../internal/slurm-gcp-v6/instance_template" + source = "../../internal/slurm-gcp/instance_template" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index cfb61787cb..874d1aff67 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "../../internal/slurm-gcp-v6/instance_template" + source = "../../internal/slurm-gcp/instance_template" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "../../internal/slurm-gcp-v6/instance" + source = "../../internal/slurm-gcp/instance" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 65bf15ede5..05fb2d5805 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -441,6 +441,7 @@ def delete_reservation(lkp: util.Lookup, reservation_name: str) -> None: def create_reservation(lkp: util.Lookup, reservation_name: str, node: str, start_time: datetime) -> None: # Format time to be compatible with slurm reservation. 
formatted_start_time = start_time.strftime('%Y-%m-%dT%H:%M:%S') + util.run(f"{lkp.scontrol} create reservation user=slurm starttime={formatted_start_time} duration=180 nodes={node} reservationname={reservation_name} flags=maint,ignore_jobs") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index e8626bd1bd..308b60d19d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "../../internal/slurm-gcp-v6/instance_template" + source = "../../internal/slurm-gcp/instance_template" for_each = local.nodeset_map project_id = var.project_id @@ -104,7 +104,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = "../../internal/slurm-gcp-v6/nodeset_tpu" + source = "../../internal/slurm-gcp/nodeset_tpu" for_each = local.nodeset_tpu_map project_id = var.project_id From 697b70a4739bdc2c047990dd3410c7139b3a9e73 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 19 Dec 2024 21:22:41 +0000 Subject: [PATCH 056/140] Fix wrong API field name --- .../modules/slurm_files/scripts/resume.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 4f3fce7213..fa5413e53c 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -129,7 +129,7 @@ def update_reservation_props(reservation:object, props:object, placement_group:O } if reservation.dense or reservation_from_fr: - props.scheduling.provisioning_model = "RESERVATION_BOUND" + props.scheduling.provisioningModel = "RESERVATION_BOUND" # Figure out `resourcePolicies` if reservation.policies: # use ones already attached to reservations From ba673d54668429e44878a51aec68ed1c892bc8cc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 19 Dec 2024 22:54:36 +0000 Subject: [PATCH 057/140] Bump golang.org/x/crypto from 0.25.0 to 0.31.0 Bumps [golang.org/x/crypto](https://github.com/golang/crypto) from 0.25.0 to 0.31.0. - [Commits](https://github.com/golang/crypto/compare/v0.25.0...v0.31.0) --- updated-dependencies: - dependency-name: golang.org/x/crypto dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- go.mod | 8 ++++---- go.sum | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/go.mod b/go.mod index 61a5f523e6..8e9d4e4d7c 100644 --- a/go.mod +++ b/go.mod @@ -51,7 +51,7 @@ require ( go.opentelemetry.io/otel/metric v1.24.0 // indirect go.opentelemetry.io/otel/trace v1.24.0 // indirect golang.org/x/mod v0.19.0 // indirect - golang.org/x/sync v0.7.0 // indirect + golang.org/x/sync v0.10.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.23.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3 // indirect @@ -95,11 +95,11 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.25.0 // indirect + golang.org/x/crypto v0.31.0 // indirect golang.org/x/net v0.27.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.27.0 - golang.org/x/text v0.16.0 // indirect + golang.org/x/sys v0.28.0 + golang.org/x/text v0.21.0 // indirect google.golang.org/grpc v1.64.1 // indirect google.golang.org/protobuf v1.34.2 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect diff --git a/go.sum b/go.sum index 1e4a67b6ba..bdf95899c5 100644 --- a/go.sum +++ b/go.sum @@ -529,8 +529,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= -golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= +golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -662,8 +662,8 @@ golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -732,13 +732,13 @@ golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod 
h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= -golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= -golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= +golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -749,8 +749,8 @@ golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= From f0272b792d0f2a8171bcd74287992173533feb6a Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 19 Dec 2024 15:13:44 -0800 Subject: [PATCH 058/140] Update google tf provider version ceiling --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- .../.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/versioned_blueprint/primary/versions.tf | 4 ++-- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index ae5c30a328..95ad1de52e 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func 
getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: "~> 6.13.0", + Version: "~> 6.14.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: "~> 6.13.0", + Version: "~> 6.14.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index e1ad008407..73f321cef0 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: "~> 6.13.0"}, + Version: "~> 6.14.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: "~> 6.13.0"}}) + Version: "~> 6.14.0"}}) } { // no def PR, group PR diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index dd66cf7aa1..b633bbafb0 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: ~> 6.13.0 + version: ~> 6.14.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: ~> 6.13.0 + version: ~> 6.14.0 configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index fab3c44cd0..df921aaeb4 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 6.13.0" + version = "~> 6.14.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 6.13.0" + version = "~> 6.14.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index 1906e9a832..8fcb8e46d7 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: ~> 6.13.0 + version: ~> 6.14.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: ~> 6.13.0 + version: ~> 6.14.0 configuration: project: ((var.project_id)) region: ((var.region)) @@ -80,14 +80,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: ~> 6.13.0 + version: ~> 6.14.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: ~> 6.13.0 + version: ~> 6.14.0 configuration: project: ((var.project_id)) region: ((var.region)) diff --git 
a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index fab3c44cd0..df921aaeb4 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 6.13.0" + version = "~> 6.14.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 6.13.0" + version = "~> 6.14.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index fab3c44cd0..df921aaeb4 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 6.13.0" + version = "~> 6.14.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 6.13.0" + version = "~> 6.14.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 15a203a4b5..c81fa3e335 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: ~> 6.13.0 + version: ~> 6.14.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: ~> 6.13.0 + version: ~> 6.14.0 configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index fab3c44cd0..df921aaeb4 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 6.13.0" + version = "~> 6.14.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 6.13.0" + version = "~> 6.14.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index 0a51078be5..cfb228725d 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -47,14 +47,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: ~> 6.13.0 + version: ~> 6.14.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: ~> 6.13.0 + version: ~> 6.14.0 configuration: project: ((var.project_id)) region: ((var.region)) diff 
--git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index fab3c44cd0..df921aaeb4 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 6.13.0" + version = "~> 6.14.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 6.13.0" + version = "~> 6.14.0" } } } From e98ad33ddd1d154a989036765a3992924ead2a1e Mon Sep 17 00:00:00 2001 From: ighosh98 Date: Fri, 20 Dec 2024 11:40:12 +0000 Subject: [PATCH 059/140] add tas plugin fix --- .../manifests/topology-scheduler-scripts.yaml | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml b/community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml index cf6cf2fb00..20438bc638 100644 --- a/community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml +++ b/community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml @@ -6,7 +6,6 @@ metadata: data: schedule-daemon.py: | #!/usr/bin/env python - """schedule-daemon.py is a Topology-aware Kubernetes pod scheduler.""" # Copyright 2024 Google Inc. All Rights Reserved. # @@ -21,6 +20,7 @@ data: # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + """schedule-daemon.py is a Topology-aware Kubernetes pod scheduler.""" import argparse import collections @@ -293,6 +293,16 @@ data: ) continue + # skip nodes that is not in Ready state + if any( + condition.type == "Ready" and condition.status != "True" for condition in node.status.conditions + ): + logging.info( + 'Skipping node %s because it is NotReady', + node_name + ) + continue + allocatable = node.status.allocatable used_cpu, used_memory, used_gpu = 0, 0, 0 @@ -445,7 +455,7 @@ data: v1: kubernetes.client.CoreV1Api, pod_name: str, pod_namespace: str, - node_name: str, + node: dict[str, Any], gate_name: str, ) -> bool: """Schedules a pod on a given node using affinity for direct assignment. @@ -454,7 +464,7 @@ data: v1: The kubernetes client. pod_name: The name of the pod to schedule. pod_namespace: The namespace of the pod to schedule. - node_name: The name of the node to schedule the pod on. + node: The node to schedule the pod on. gate_name: The name of the gate to remove from the pod. 
Returns: @@ -473,7 +483,7 @@ data: 'matchExpressions': [{ 'key': 'kubernetes.io/hostname', 'operator': 'In', - 'values': [node_name], + 'values': [node['name']], }] }] } @@ -484,7 +494,7 @@ data: v1.replace_namespaced_pod(pod_name, pod_namespace, pod) logging.info( - 'Pod %s/%s scheduled on %s', pod_namespace, pod_name, node_name + 'Pod %s/%s scheduled on %s with topology %s', pod_namespace, pod_name, node['name'], node_topology_key(node) ) except kubernetes.client.rest.ApiException as e: logging.exception( @@ -727,7 +737,7 @@ data: for i, pod in enumerate(sorted_pods): node = sorted_nodes[best_assignment[i]] if not schedule_pod_on_node( - v1, pod['name'], pod['namespace'], node['name'], gate_name + v1, pod['name'], pod['namespace'], node, gate_name ): logging.error( 'Failed to schedule pod %s on node %s. Skipping job %s', From a9f4617dc5b793d5c10f0606bc674a7dd9f9f053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Thu, 21 Nov 2024 06:16:52 +0000 Subject: [PATCH 060/140] Include MemSpecLimit when calculating defmem --- .../modules/slurm_files/scripts/conf.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index dd3d628cbb..ef0f747f64 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -184,10 +184,12 @@ def partitionlines(partition, lkp: util.Lookup) -> str: """Make a partition line for the slurm.conf""" MIN_MEM_PER_CPU = 100 - def defmempercpu(nodeset: str) -> int: - template = lkp.cfg.nodeset.get(nodeset).instance_template + def defmempercpu(nodeset_name: str) -> int: + nodeset = lkp.cfg.nodeset.get(nodeset_name) + template = nodeset.instance_template machine = lkp.template_machine_conf(template) - return max(MIN_MEM_PER_CPU, machine.memory // machine.cpus) + mem_spec_limit = int(nodeset.node_conf.get("MemSpecLimit", 0)) + return max(MIN_MEM_PER_CPU, (machine.memory - mem_spec_limit) // machine.cpus) defmem = min( map(defmempercpu, partition.partition_nodeset), default=MIN_MEM_PER_CPU From 5c400c79272b13eca12e66e2748708a0b3e36ffc Mon Sep 17 00:00:00 2001 From: In-Ho Yi Date: Fri, 20 Dec 2024 17:27:46 +0000 Subject: [PATCH 061/140] Add lifecycle rule to ignore local SSDs These settings are known to be tied to machine types and sometimes API changes prompt change not best tracked by TF states --- modules/compute/gke-node-pool/main.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index e971af24dc..48064aff98 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -226,6 +226,8 @@ resource "google_container_node_pool" "node_pool" { ignore_changes = [ node_config[0].labels, initial_node_count, + node_config[0].ephemeral_storage_local_ssd_config[0].local_ssd_count, + node_config[0].local_nvme_ssd_block_config[0].local_ssd_count, ] precondition { condition = (var.max_pods_per_node == null) || (data.google_container_cluster.gke_cluster.networking_mode == "VPC_NATIVE") From 6c108882ffa787cf72633f9fb8cdfcf287326ff0 Mon Sep 17 00:00:00 2001 From: In-Ho Yi Date: Fri, 20 Dec 2024 17:35:26 +0000 Subject: [PATCH 062/140] Fix non-constant format string errors --- cmd/create.go 
| 16 ++++++++-------- cmd/root.go | 2 +- pkg/modulereader/hcl_utils.go | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cmd/create.go b/cmd/create.go index 17ec0eb442..3ea151cdcd 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -91,7 +91,7 @@ func printAdvancedInstructionsMessage(deplDir string) { logging.Info("Find instructions for cleanly destroying infrastructure and advanced manual") logging.Info("deployment instructions at:") logging.Info("") - logging.Info(modulewriter.InstructionsPath(deplDir)) + logging.Info("%s", modulewriter.InstructionsPath(deplDir)) } // TODO: move to expand.go @@ -135,10 +135,10 @@ func v5DeprecationWarning(bp config.Blueprint) { alreadyContainsV5 := false bp.WalkModulesSafe(func(mp config.ModulePath, m *config.Module) { if strings.Contains(m.Source, "schedmd-slurm-gcp-v5-controller") && !alreadyContainsV5 { - logging.Info(boldYellow( - "We have been supporting slurm-gcp v5 since July 2022 and are now deprecating it, as we've launched slurm-gcp v6 in June 2024. \n" + - "Toolkit blueprints using Slurm-gcp v5 will be marked “deprecated” starting October 2024 and slurm-gcp v6 will be the default deployment. \n" + - "However we won't begin removing slurm-gcp v5 blueprints until January 6, 2025. Beginning on January 6, 2025, the Cluster Toolkit team will cease their support for Slurm-gcp v5. \n" + + logging.Info("%s", boldYellow( + "We have been supporting slurm-gcp v5 since July 2022 and are now deprecating it, as we've launched slurm-gcp v6 in June 2024. \n"+ + "Toolkit blueprints using Slurm-gcp v5 will be marked “deprecated” starting October 2024 and slurm-gcp v6 will be the default deployment. \n"+ + "However we won't begin removing slurm-gcp v5 blueprints until January 6, 2025. Beginning on January 6, 2025, the Cluster Toolkit team will cease their support for Slurm-gcp v5. \n"+ "While this will not directly or immediately impact running clusters, we recommend replacing any v5 clusters with Slurm-gcp v6.", )) alreadyContainsV5 = true // This is to avoid the logging message showing repeatedly for multiple v5 controllers @@ -152,7 +152,7 @@ func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) { if err == nil { return } - logging.Error(renderError(err, ctx)) + logging.Error("%s", renderError(err, ctx)) logging.Error("One or more blueprint validators has failed. See messages above for suggested") logging.Error("actions. 
General troubleshooting guidance and instructions for configuring") @@ -169,12 +169,12 @@ func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) { switch bp.ValidationLevel { case config.ValidationWarning: { - logging.Error(boldYellow("Validation failures were treated as a warning, continuing to create blueprint.")) + logging.Error("%s", boldYellow("Validation failures were treated as a warning, continuing to create blueprint.")) logging.Error("") } case config.ValidationError: { - logging.Fatal(boldRed("validation failed due to the issues listed above")) + logging.Fatal("%s", boldRed("validation failed due to the issues listed above")) } } diff --git a/cmd/root.go b/cmd/root.go index d19219fb97..a5ccddfc8f 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -262,6 +262,6 @@ func checkErr(err error, ctx *config.YamlCtx) { ctx = &config.YamlCtx{} } if err != nil { - logging.Fatal(renderError(err, *ctx)) + logging.Fatal("%s", renderError(err, *ctx)) } } diff --git a/pkg/modulereader/hcl_utils.go b/pkg/modulereader/hcl_utils.go index 5119aba662..4e2cb7c11e 100644 --- a/pkg/modulereader/hcl_utils.go +++ b/pkg/modulereader/hcl_utils.go @@ -136,7 +136,7 @@ func ReadHclAttributes(file string) (map[string]cty.Value, error) { // work around ugly in error message missing d.Subject // https://github.com/hashicorp/hcl2/blob/fb75b3253c80b3bc7ca99c4bfa2ad6743841b1af/hcl/diagnostic.go#L76-L78 if len(diags) == 1 { - return nil, fmt.Errorf(diags[0].Detail) + return nil, fmt.Errorf("%s", diags[0].Detail) } return nil, diags } From ecd4d9ce26221829f4713e8ceac56a7caaaa6510 Mon Sep 17 00:00:00 2001 From: In-Ho Yi Date: Fri, 20 Dec 2024 18:14:32 +0000 Subject: [PATCH 063/140] Fix lifecycle rule to properly ignore local ssd --- modules/compute/gke-node-pool/main.tf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 0001d204e0..b0bb2c8e30 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -235,8 +235,9 @@ resource "google_container_node_pool" "node_pool" { ignore_changes = [ node_config[0].labels, initial_node_count, - node_config[0].ephemeral_storage_local_ssd_config[0].local_ssd_count, - node_config[0].local_nvme_ssd_block_config[0].local_ssd_count, + # Ignore local/ephemeral ssd configs as they are tied to machine types. 
+ node_config[0].ephemeral_storage_local_ssd_config, + node_config[0].local_nvme_ssd_block_config, ] precondition { condition = (var.max_pods_per_node == null) || (data.google_container_cluster.gke_cluster.networking_mode == "VPC_NATIVE") From 3339f2bf3cdb8fa8e43cf85943e0fb0559b9ef22 Mon Sep 17 00:00:00 2001 From: In-Ho Yi Date: Fri, 20 Dec 2024 21:10:34 +0000 Subject: [PATCH 064/140] Use errors.New to report hcl parse error --- pkg/modulereader/hcl_utils.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/modulereader/hcl_utils.go b/pkg/modulereader/hcl_utils.go index 4e2cb7c11e..9f4ded6a34 100644 --- a/pkg/modulereader/hcl_utils.go +++ b/pkg/modulereader/hcl_utils.go @@ -15,6 +15,7 @@ package modulereader import ( + "errors" "fmt" "hpc-toolkit/pkg/logging" "hpc-toolkit/pkg/sourcereader" @@ -136,7 +137,7 @@ func ReadHclAttributes(file string) (map[string]cty.Value, error) { // work around ugly in error message missing d.Subject // https://github.com/hashicorp/hcl2/blob/fb75b3253c80b3bc7ca99c4bfa2ad6743841b1af/hcl/diagnostic.go#L76-L78 if len(diags) == 1 { - return nil, fmt.Errorf("%s", diags[0].Detail) + return nil, errors.New(diags[0].Detail) } return nil, diags } From 5a11dd4670bbd9fe4abc28f4c36ac5a72c1fa7d9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 20 Dec 2024 04:41:25 +0000 Subject: [PATCH 065/140] Replace duplicated `gpu_definition.tf` by shared internal module NOTE: keep ones in `schedmd-slurm-gcp-v5*` due to soon removal. --- .../compute/htcondor-execute-point/README.md | 1 + .../compute/htcondor-execute-point/main.tf | 9 +++ .../README.md | 1 + .../gpu_definition.tf | 58 ------------------- .../main.tf | 9 +++ .../schedmd-slurm-gcp-v6-nodeset/README.md | 4 +- .../gpu_definition.tf | 58 ------------------- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 9 +++ .../schedmd-slurm-gcp-v6-controller/README.md | 1 + .../controller.tf | 9 ++- .../gpu_definition.tf | 58 ------------------- .../schedmd-slurm-gcp-v6-login/README.md | 4 +- .../gpu_definition.tf | 58 ------------------- .../schedmd-slurm-gcp-v6-login/main.tf | 9 +++ modules/compute/gke-node-pool/README.md | 1 + .../compute/gke-node-pool/gpu_definition.tf | 58 ------------------- modules/compute/gke-node-pool/main.tf | 9 +++ .../gke-node-pool/reservation_definitions.tf | 1 + modules/compute/vm-instance/README.md | 1 + modules/compute/vm-instance/gpu_definition.tf | 58 ------------------- modules/compute/vm-instance/main.tf | 9 +++ modules/internal/gpu-definition/README.md | 47 +++++++++++++++ .../internal/gpu-definition/main.tf | 33 +++++++++-- tools/duplicate-diff.py | 7 --- 24 files changed, 150 insertions(+), 362 deletions(-) delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf delete mode 100644 modules/compute/gke-node-pool/gpu_definition.tf delete mode 100644 modules/compute/vm-instance/gpu_definition.tf create mode 100644 modules/internal/gpu-definition/README.md rename community/modules/compute/htcondor-execute-point/gpu_definition.tf => modules/internal/gpu-definition/main.tf (75%) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index c7068a4522..fe1d49c4d8 100644 --- 
a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -211,6 +211,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 12.1 | +| [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a | | [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | ~> 12.1 | | [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 0d8171092a..fb875f01e4 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -19,7 +19,16 @@ locals { labels = merge(var.labels, { ghpc_module = "htcondor-execute-point", ghpc_role = "compute" }) } +module "gpu" { + source = "../../../../modules/internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + zones = coalescelist(var.zones, data.google_compute_zones.available.names) network_storage_metadata = var.network_storage == null ? {} : { network_storage = jsonencode(var.network_storage) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 50f0cbc6e0..755ded9f61 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,6 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| +| [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a | | [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | ../../internal/slurm-gcp/instance_template | n/a | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
-*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index a528978760..c3235c0229 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -17,7 +17,16 @@ locals { labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset-dynamic", ghpc_role = "compute" }) } +module "gpu" { + source = "../../../../modules/internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + nodeset_name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 14) feature = coalesce(var.feature, local.nodeset_name) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 297c40bb7a..ce82c34172 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -142,7 +142,9 @@ modules. For support with the underlying modules, see the instructions in the ## Modules -No modules. 
+| Name | Source | Version | +|------|--------|---------| +| [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 84cb60457a..c0a99f99bb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -17,7 +17,16 @@ locals { labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset", ghpc_role = "compute" }) } +module "gpu" { + source = "../../../../modules/internal/gpu-definition" + + machine_type = 
var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index b03fbf0973..99078dbcce 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -259,6 +259,7 @@ limitations under the License. |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 6.1 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a | +| [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | | [slurm\_controller\_template](#module\_slurm\_controller\_template) | ../../internal/slurm-gcp/instance_template | n/a | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index c98813a722..879509f693 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -12,6 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +module "gpu" { + source = "../../../../modules/internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { additional_disks = [ for ad in var.additional_disks : { @@ -67,7 +74,7 @@ module "slurm_controller_template" { enable_shielded_vm = var.enable_shielded_vm shielded_instance_config = var.shielded_instance_config - gpu = one(local.guest_accelerator) + gpu = one(module.gpu.guest_accelerator) machine_type = var.machine_type metadata = local.metadata diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
-*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 7160fbdd02..023f4d161b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -71,7 +71,9 @@ modules. For support with the underlying modules, see the instructions in the ## Modules -No modules. +| Name | Source | Version | +|------|--------|---------| +| [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
-*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf index 1632116209..6568996e75 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf @@ -17,7 +17,16 @@ locals { labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-login", ghpc_role = "scheduler" }) } +module "gpu" { + source = "../../../../modules/internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index d2715ff652..a1fcaa8f01 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -295,6 +295,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| +| [gpu](#module\_gpu) | ../../internal/gpu-definition | n/a | | [kubectl\_apply](#module\_kubectl\_apply) | ../../management/kubectl-apply | n/a | ## Resources diff --git a/modules/compute/gke-node-pool/gpu_definition.tf b/modules/compute/gke-node-pool/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/modules/compute/gke-node-pool/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index f1999cbd0b..c91c791393 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -27,7 +27,16 @@ locals { } } +module "gpu" { + source = "../../internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + has_gpu = length(local.guest_accelerator) > 0 allocatable_gpu_per_node = local.has_gpu ? 
max(local.guest_accelerator[*].count...) : -1 gpu_taint = local.has_gpu ? [{ diff --git a/modules/compute/gke-node-pool/reservation_definitions.tf b/modules/compute/gke-node-pool/reservation_definitions.tf index 37b92a2f1a..cb24e4204c 100644 --- a/modules/compute/gke-node-pool/reservation_definitions.tf +++ b/modules/compute/gke-node-pool/reservation_definitions.tf @@ -48,6 +48,7 @@ data "google_compute_reservation" "specific_reservations" { } locals { + generated_guest_accelerator = module.gpu.machine_type_guest_accelerator reservation_resource_api_label = "compute.googleapis.com/reservation-name" input_specific_reservations_count = try(length(var.reservation_affinity.specific_reservations), 0) diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 8fe80e1cdc..e75b70865d 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -185,6 +185,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| +| [gpu](#module\_gpu) | ../../internal/gpu-definition | n/a | | [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | ../../scripts/startup-script | n/a | ## Resources diff --git a/modules/compute/vm-instance/gpu_definition.tf b/modules/compute/vm-instance/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/modules/compute/vm-instance/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
-*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index dcb43fe91a..9b74632678 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -19,7 +19,16 @@ locals { labels = merge(var.labels, { ghpc_module = "vm-instance", ghpc_role = "compute" }) } +module "gpu" { + source = "../../internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + native_fstype = [] startup_script = local.startup_from_network_storage != null ? ( { startup-script = local.startup_from_network_storage }) : {} diff --git a/modules/internal/gpu-definition/README.md b/modules/internal/gpu-definition/README.md new file mode 100644 index 0000000000..29a87cab78 --- /dev/null +++ b/modules/internal/gpu-definition/README.md @@ -0,0 +1,47 @@ + +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | + +## Providers + +No providers. 
+ +## Modules + +No modules. + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({ type = string, count = number }))
| `[]` | no | +| [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [guest\_accelerator](#output\_guest\_accelerator) | Sanitized list of the type and count of accelerator cards attached to the instance. | +| [machine\_type\_guest\_accelerator](#output\_machine\_type\_guest\_accelerator) | List of the type and count of accelerator cards attached to the specified machine type. | + diff --git a/community/modules/compute/htcondor-execute-point/gpu_definition.tf b/modules/internal/gpu-definition/main.tf similarity index 75% rename from community/modules/compute/htcondor-execute-point/gpu_definition.tf rename to modules/internal/gpu-definition/main.tf index 1c84a92721..bc66442e5e 100644 --- a/community/modules/compute/htcondor-execute-point/gpu_definition.tf +++ b/modules/internal/gpu-definition/main.tf @@ -1,5 +1,5 @@ /** - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,20 @@ * limitations under the License. */ -## Required variables: -# guest_accelerator -# machine_type +variable "machine_type" { + description = "Machine type to use for the instance creation" + type = string +} + +variable "guest_accelerator" { + description = "List of the type and count of accelerator cards attached to the instance." + type = list(object({ + type = string, + count = number + })) + default = [] + nullable = false +} locals { # example state; terraform will ignore diffs if last element of URL matches @@ -56,3 +67,17 @@ locals { # (3) default to empty list if both are empty guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } + +output "guest_accelerator" { + description = "Sanitized list of the type and count of accelerator cards attached to the instance." + value = local.guest_accelerator +} + +output "machine_type_guest_accelerator" { + description = "List of the type and count of accelerator cards attached to the specified machine type." 
+ value = local.generated_guest_accelerator +} + +terraform { + required_version = ">= 1.3" +} diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index 703f00ff95..5a7c83ac1e 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -36,16 +36,9 @@ "modules/compute/vm-instance/startup_from_network_storage.tf", ], [ - "modules/compute/vm-instance/gpu_definition.tf", - "community/modules/compute/htcondor-execute-point/gpu_definition.tf", "community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf", "community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf", "community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf", - "community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf", - "community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf", - "community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf", - "community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf", - "modules/compute/gke-node-pool/gpu_definition.tf", ], [ "modules/compute/gke-node-pool/threads_per_core_calc.tf", From 7c686ca0da681256f6f7ec7cc682ba7a1eca730e Mon Sep 17 00:00:00 2001 From: ighosh98 Date: Thu, 19 Dec 2024 20:07:11 +0000 Subject: [PATCH 066/140] A3 Ultra integration tests --- examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml | 34 +++++------ .../daily-tests/builds/gke-a3-ultragpu.yaml | 60 +++++++++++++++++++ .../daily-tests/tests/gke-a3-ultragpu.yml | 39 ++++++++++++ 3 files changed, 116 insertions(+), 17 deletions(-) create mode 100644 tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml create mode 100644 tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml index 2eb10b679c..d15f579cb7 100644 --- a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml @@ -33,6 +33,22 @@ vars: system_node_pool_disk_size_gb: 200 a3ultra_node_pool_disk_size_gb: 100 +terraform_providers: + google: + source: hashicorp/google + version: 6.13.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + google-beta: + source: hashicorp/google-beta + version: 6.13.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + deployment_groups: - group: primary modules: @@ -171,7 +187,7 @@ deployment_groups: use: [a3-ultragpu-cluster] - id: workload-manager-install - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=e0c690b + source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=8c26d4a use: [a3-ultragpu-cluster] settings: kueue: @@ -194,19 +210,3 @@ deployment_groups: node_count: 2 name: run-nvidia-smi outputs: [instructions] - -terraform_providers: - google: - source: hashicorp/google - version: 6.13.0 - configuration: - project: $(vars.project_id) - region: $(vars.region) - zone: $(vars.zone) - google-beta: - source: hashicorp/google-beta - version: 6.13.0 - configuration: - project: $(vars.project_id) - region: $(vars.region) - zone: $(vars.zone) diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml new file mode 100644 index 0000000000..c0a3cbb196 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml @@ -0,0 +1,60 @@ +# Copyright 2023 Google LLC +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.gke-job-template +- gke + + +timeout: 14400s # 4hr +steps: +- id: gke-a3-ultragpu + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + EXAMPLE_BP=examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml + + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${EXAMPLE_BP} + echo ' source: modules/compute/vm-instance' >> $${EXAMPLE_BP} + echo ' use: [gke-a3-ultra-net-0]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' machine_type: e2-standard-2' >> $${EXAMPLE_BP} + echo ' name_prefix: remote-node' >> $${EXAMPLE_BP} + echo ' add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP} + echo '' + echo ' - id: job_template_hostname' >> $${EXAMPLE_BP} + echo ' source: modules/compute/gke-job-template' >> $${EXAMPLE_BP} + echo ' use: [a3-ultragpu-pool]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' image: nvidia/cuda:11.0.3-runtime-ubuntu20.04' >> $${EXAMPLE_BP} + echo ' command:' >> $${EXAMPLE_BP} + echo ' - nvidia-smi' >> $${EXAMPLE_BP} + echo ' node_count: 1' >> $${EXAMPLE_BP} + echo ' outputs: [instructions]' >> $${EXAMPLE_BP} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml b/tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml new file mode 100644 index 0000000000..bb13b25d5c --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml @@ -0,0 +1,39 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +# region, zone must be defined +# in build file with --extra-vars flag! 
+test_name: gke-a3ultra +deployment_name: gke-a3ultra-{{ build }} +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml" +network: gke-a3-ultra-net-0 +region: europe-west1 +zone: europe-west1-b +remote_node: "{{ deployment_name }}-remote-node-0" +extended_reservation: slurm-dev-gcp-a3u-gsc +static_node_count: 1 +cli_deployment_vars: + region: "{{ region }}" + zone: "{{ zone }}" + static_node_count: "{{ static_node_count }}" + extended_reservation: "{{ extended_reservation }}" + authorized_cidr: "{{ build_ip.stdout }}/32" + gcp_public_cidrs_access_enabled: true +custom_vars: + project: "{{ project }}" +post_deploy_tests: +- test-validation/test-gke-job.yml From 72ac71ec3cca723bbfdb5465a5ade8ba4454ce98 Mon Sep 17 00:00:00 2001 From: ighosh98 Date: Sat, 21 Dec 2024 21:11:51 +0000 Subject: [PATCH 067/140] Update A3U template design --- examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml | 35 ++++++++++--------- .../daily-tests/builds/gke-a3-ultragpu.yaml | 7 +++- .../daily-tests/tests/gke-a3-ultragpu.yml | 2 +- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml index d15f579cb7..3037132c21 100644 --- a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml @@ -53,21 +53,22 @@ deployment_groups: - group: primary modules: - id: gke-a3-ultra-net-0 - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/vpc?ref=e0c690b + source: modules/network/vpc settings: - network_name: gke-a3-ultra-net-0 + network_name: $(vars.deployment_name)-net-0 subnetworks: - - subnet_name: gke-a3-ultra-sub-0 + - subnet_name: $(vars.deployment_name)-sub-0 subnet_region: $(vars.region) subnet_ip: 192.168.0.0/18 - secondary_ranges: - gke-a3-ultra-sub-0: + secondary_ranges_list: + - subnetwork_name: $(vars.deployment_name)-sub-0 + ranges: - range_name: pods ip_cidr_range: 10.4.0.0/14 - range_name: services ip_cidr_range: 10.0.32.0/20 firewall_rules: - - name: gke-a3-ultra-internal-0 + - name: $(vars.deployment_name)-internal-0 ranges: [192.168.0.0/16] allow: - protocol: tcp @@ -77,16 +78,16 @@ deployment_groups: - protocol: icmp - id: gke-a3-ultra-net-1 - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/vpc?ref=e0c690b + source: modules/network/vpc settings: - network_name: gke-a3-ultra-net-1 + network_name: $(vars.deployment_name)-net-1 mtu: $(vars.mtu_size) subnetworks: - - subnet_name: gke-a3-ultra-sub-1 + - subnet_name: $(vars.deployment_name)-sub-1 subnet_region: $(vars.region) subnet_ip: 192.168.64.0/18 firewall_rules: - - name: gke-a3-ultra-internal-1 + - name: $(vars.deployment_name)-internal-1 ranges: [192.168.0.0/16] allow: - protocol: tcp @@ -96,20 +97,20 @@ deployment_groups: - protocol: icmp - id: gke-a3-ultra-rdma-net - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc?ref=98c49fe + source: modules/network/gpu-rdma-vpc settings: - network_name: gke-a3-ultra-rdma-net + network_name: $(vars.deployment_name)-rdma-net mtu: $(vars.mtu_size) network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce network_routing_mode: REGIONAL subnetworks_template: - name_prefix: gke-a3-ultra-rdma-sub + name_prefix: $(vars.deployment_name)-rdma-sub count: 8 ip_range: 192.168.128.0/18 region: $(vars.region) - id: a3-ultragpu-cluster - source: 
github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster?ref=e0c690b + source: modules/scheduler/gke-cluster use: [gke-a3-ultra-net-0] settings: release_channel: RAPID @@ -146,7 +147,7 @@ deployment_groups: outputs: [instructions] - id: a3-ultragpu-pool - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool?ref=e0c690b + source: modules/compute/gke-node-pool use: [a3-ultragpu-cluster] settings: machine_type: a3-ultragpu-8g @@ -183,11 +184,11 @@ deployment_groups: outputs: [instructions] - id: topology-aware-scheduler-install - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler?ref=e0c690b + source: community/modules/compute/gke-topology-scheduler use: [a3-ultragpu-cluster] - id: workload-manager-install - source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=8c26d4a + source: modules/management/kubectl-apply use: [a3-ultragpu-cluster] settings: kueue: diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml index c0a3cbb196..c8ffdb136e 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml @@ -16,7 +16,12 @@ tags: - m.gke-job-template - gke - +- m.gke-cluster +- m.gke-node-pool +- m.gke-topology-scheduler +- m.gpu-rdma-vpc +- m.kubectl-apply +- m.vpc timeout: 14400s # 4hr steps: diff --git a/tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml b/tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml index bb13b25d5c..a1dd8c72f4 100644 --- a/tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml +++ b/tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml @@ -20,7 +20,7 @@ test_name: gke-a3ultra deployment_name: gke-a3ultra-{{ build }} workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml" -network: gke-a3-ultra-net-0 +network: "{{ deployment_name }}-net-0" region: europe-west1 zone: europe-west1-b remote_node: "{{ deployment_name }}-remote-node-0" From 9ac1bc52eac436be810e017e8c2cfaf55600b651 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Sun, 22 Dec 2024 09:57:27 -0700 Subject: [PATCH 068/140] Update mount-daos.sh Fix network interfaces for ubuntu, similar to debian. 
--- .../pre-existing-network-storage/scripts/mount-daos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh b/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh index 50ac2b273c..a6a133b05d 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh @@ -54,7 +54,7 @@ sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config # Get names of network interfaces not in first PCI slot # The first PCI slot is a standard network adapter while remaining interfaces # are typically network cards dedicated to GPU or workload communication -if [[ "$OS_ID" == "debian" ]]; then +if [[ "$OS_ID" == "debian" ]] || [[ "${OS_ID}" = "ubuntu" ]]; then extra_interfaces=$(find /sys/class/net/ -not -name 'enp0s*' -regextype posix-extended -regex '.*/enp[0-9]+s.*' -printf '"%f"\n' | paste -s -d ',') elif [[ "${OS_ID}" = "rocky" ]] || [[ "${OS_ID}" = "rhel" ]]; then extra_interfaces=$(find /sys/class/net/ -not -name eth0 -regextype posix-extended -regex '.*/eth[0-9]+' -printf '"%f"\n' | paste -s -d ',') From b39e07f979acacbc81fbe7601599f6693c43b695 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Mon, 23 Dec 2024 01:49:45 +0000 Subject: [PATCH 069/140] Add to parallelstore scripts --- modules/file-system/parallelstore/scripts/mount-daos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/file-system/parallelstore/scripts/mount-daos.sh b/modules/file-system/parallelstore/scripts/mount-daos.sh index 50ac2b273c..a6a133b05d 100644 --- a/modules/file-system/parallelstore/scripts/mount-daos.sh +++ b/modules/file-system/parallelstore/scripts/mount-daos.sh @@ -54,7 +54,7 @@ sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config # Get names of network interfaces not in first PCI slot # The first PCI slot is a standard network adapter while remaining interfaces # are typically network cards dedicated to GPU or workload communication -if [[ "$OS_ID" == "debian" ]]; then +if [[ "$OS_ID" == "debian" ]] || [[ "${OS_ID}" = "ubuntu" ]]; then extra_interfaces=$(find /sys/class/net/ -not -name 'enp0s*' -regextype posix-extended -regex '.*/enp[0-9]+s.*' -printf '"%f"\n' | paste -s -d ',') elif [[ "${OS_ID}" = "rocky" ]] || [[ "${OS_ID}" = "rhel" ]]; then extra_interfaces=$(find /sys/class/net/ -not -name eth0 -regextype posix-extended -regex '.*/eth[0-9]+' -printf '"%f"\n' | paste -s -d ',') From 72314e77eb8f08dda67f00d0177bec555ba28682 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Thu, 19 Dec 2024 03:37:12 +0000 Subject: [PATCH 070/140] Update python test deployment name to build id --- .../daily-tests/builds/slurm-gcp-v6-reconfig-size.yaml | 2 ++ .../builds/slurm-gcp-v6-simple-job-completion.yaml | 2 ++ .../daily-tests/builds/slurm-gcp-v6-topology.yaml | 2 ++ .../blueprints/slurm-simple-reconfig.yaml | 2 +- .../blueprints/slurm-simple.yaml | 2 +- .../blueprints/topology-test.yaml | 2 +- tools/python-integration-tests/deployment.py | 10 +++++++++- 7 files changed, 18 insertions(+), 4 deletions(-) diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-reconfig-size.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-reconfig-size.yaml index 8d6e390ebe..d67fd64e1e 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-reconfig-size.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-reconfig-size.yaml @@ -31,4 +31,6 @@ steps: - | set -x -e cd 
/workspace && make + export BUILD_ID="${BUILD_ID}" + python3 tools/python-integration-tests/slurm_reconfig_size.py diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-simple-job-completion.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-simple-job-completion.yaml index 7acd7bdc11..7a8a8f3a26 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-simple-job-completion.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-simple-job-completion.yaml @@ -31,4 +31,6 @@ steps: - | set -x -e cd /workspace && make + export BUILD_ID="${BUILD_ID}" + python3 tools/python-integration-tests/slurm_simple_job_completion.py diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-topology.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-topology.yaml index 51bfa17c71..f96bd876fc 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-topology.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-topology.yaml @@ -31,4 +31,6 @@ steps: - | set -x -e cd /workspace && make + export BUILD_ID="${BUILD_ID}" + python3 tools/python-integration-tests/slurm_topology.py diff --git a/tools/python-integration-tests/blueprints/slurm-simple-reconfig.yaml b/tools/python-integration-tests/blueprints/slurm-simple-reconfig.yaml index a9ac6d891f..a014c03c0f 100644 --- a/tools/python-integration-tests/blueprints/slurm-simple-reconfig.yaml +++ b/tools/python-integration-tests/blueprints/slurm-simple-reconfig.yaml @@ -17,7 +17,7 @@ blueprint_name: slurm-test vars: project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-test + deployment_name: ## Set Deployment Name Here ## region: us-central1 zone: us-central1-a diff --git a/tools/python-integration-tests/blueprints/slurm-simple.yaml b/tools/python-integration-tests/blueprints/slurm-simple.yaml index 235674c4d2..b0451f3cc9 100644 --- a/tools/python-integration-tests/blueprints/slurm-simple.yaml +++ b/tools/python-integration-tests/blueprints/slurm-simple.yaml @@ -17,7 +17,7 @@ blueprint_name: slurm-test vars: project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-test + deployment_name: ## Set Deployment Name Here ## region: us-central1 zone: us-central1-a diff --git a/tools/python-integration-tests/blueprints/topology-test.yaml b/tools/python-integration-tests/blueprints/topology-test.yaml index acb494c801..0dbf627e6c 100644 --- a/tools/python-integration-tests/blueprints/topology-test.yaml +++ b/tools/python-integration-tests/blueprints/topology-test.yaml @@ -17,7 +17,7 @@ blueprint_name: topology-test vars: project_id: ## Set GCP Project ID Here ## - deployment_name: topology-test + deployment_name: ## Set Deployment Name Here ## region: us-central1 zone: us-central1-a diff --git a/tools/python-integration-tests/deployment.py b/tools/python-integration-tests/deployment.py index 3ed43361b9..c0ca562656 100644 --- a/tools/python-integration-tests/deployment.py +++ b/tools/python-integration-tests/deployment.py @@ -17,6 +17,7 @@ import os import subprocess import yaml +import uuid class Deployment: def __init__(self, blueprint: str): @@ -37,7 +38,6 @@ def run_command(self, cmd: str, err_msg: str = None) -> subprocess.CompletedProc def parse_blueprint(self, file_path: str): with open(file_path, 'r') as file: content = yaml.safe_load(file) - self.deployment_name = content["vars"]["deployment_name"] self.zone = content["vars"]["zone"] def get_posixAccount_info(self): @@ -50,8 +50,16 @@ def get_posixAccount_info(self): self.project_id = account['accountId'] self.username = account['username'] + def 
generate_uniq_deployment_name(self): +        BUILD_ID = os.environ.get('BUILD_ID') +        if BUILD_ID: +            return BUILD_ID[:6] +        else: +            return str(uuid.uuid4())[:6] +      def set_deployment_variables(self):         self.workspace = os.path.abspath(os.getcwd().strip()) +        self.deployment_name = self.generate_uniq_deployment_name()         self.parse_blueprint(self.blueprint_yaml)         self.get_posixAccount_info()         self.instance_name = self.deployment_name.replace("-", "")[:10] + "-slurm-login-001"  From 3f0b32db45c0b3f764913ecf6faad1c90c5c92f3 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 23 Dec 2024 02:39:27 +0000 Subject: [PATCH 071/140] Add full definition of `nodeset` to partition module --- .../schedmd-slurm-gcp-v6-partition/README.md | 2 +- .../variables.tf | 108 +++++++++++++++++- .../test_configs/node-groups.yaml | 46 -------- 3 files changed, 107 insertions(+), 49 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index 54148697bf..00aa1198e8 100--- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -85,7 +85,7 @@ No resources. | [exclusive](#input\_exclusive) | Exclusive job access to nodes. When set to true nodes execute single job and are deleted
after job exits. If set to false, multiple jobs can be scheduled on one node. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | DEPRECATED |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | A list of nodesets.
For type definition see community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf::nodeset | `list(any)` | `[]` | no | +| [nodeset](#input\_nodeset) | A list of nodesets.
For type definition see community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf::nodeset |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, false)
enable_opportunistic_maintenance = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
future_reservation = string
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf index e14e44b02a..5798e68878 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf @@ -54,8 +54,112 @@ variable "nodeset" { A list of nodesets. For type definition see community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf::nodeset EOD - type = list(any) - default = [] + type = list(object({ + node_count_static = optional(number, 0) + node_count_dynamic_max = optional(number, 1) + node_conf = optional(map(string), {}) + nodeset_name = string + additional_disks = optional(list(object({ + disk_name = optional(string) + device_name = optional(string) + disk_size_gb = optional(number) + disk_type = optional(string) + disk_labels = optional(map(string), {}) + auto_delete = optional(bool, true) + boot = optional(bool, false) + })), []) + bandwidth_tier = optional(string, "platform_default") + can_ip_forward = optional(bool, false) + disable_smt = optional(bool, false) + disk_auto_delete = optional(bool, true) + disk_labels = optional(map(string), {}) + disk_size_gb = optional(number) + disk_type = optional(string) + enable_confidential_vm = optional(bool, false) + enable_placement = optional(bool, false) + enable_oslogin = optional(bool, true) + enable_shielded_vm = optional(bool, false) + enable_maintenance_reservation = optional(bool, false) + enable_opportunistic_maintenance = optional(bool, false) + gpu = optional(object({ + count = number + type = string + })) + dws_flex = object({ + enabled = bool + max_run_duration = number + use_job_duration = bool + }) + labels = optional(map(string), {}) + machine_type = optional(string) + maintenance_interval = optional(string) + instance_properties_json = string + metadata = optional(map(string), {}) + min_cpu_platform = optional(string) + network_tier = optional(string, "STANDARD") + network_storage = optional(list(object({ + server_ip = string + remote_mount = string + local_mount = string + fs_type = string + mount_options = string + client_install_runner = optional(map(string)) + mount_runner = optional(map(string)) + })), []) + on_host_maintenance = optional(string) + preemptible = optional(bool, false) + region = optional(string) + service_account = optional(object({ + email = optional(string) + scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"]) + })) + shielded_instance_config = optional(object({ + enable_integrity_monitoring = optional(bool, true) + enable_secure_boot = optional(bool, true) + enable_vtpm = optional(bool, true) + })) + source_image_family = optional(string) + source_image_project = optional(string) + source_image = optional(string) + subnetwork_self_link = string + additional_networks = optional(list(object({ + network = string + subnetwork = string + subnetwork_project = string + network_ip = string + nic_type = string + stack_type = string + queue_count = number + access_config = list(object({ + nat_ip = string + network_tier = string + })) + ipv6_access_config = list(object({ + network_tier = string + })) + alias_ip_range = list(object({ + ip_cidr_range = string + subnetwork_range_name = string + })) + }))) + access_config = optional(list(object({ + nat_ip = string + network_tier = string + }))) + spot = 
optional(bool, false) + tags = optional(list(string), []) + termination_action = optional(string) + reservation_name = optional(string) + future_reservation = string + startup_script = optional(list(object({ + filename = string + content = string })), []) + + zone_target_shape = string + zone_policy_allow = set(string) + zone_policy_deny = set(string) + })) + default = [] validation { condition = length(distinct(var.nodeset[*].nodeset_name)) == length(var.nodeset) diff --git a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml index 962d1e3130..ef512993c7 100644 --- a/tools/validate_configs/test_configs/node-groups.yaml +++ b/tools/validate_configs/test_configs/node-groups.yaml @@ -107,51 +107,6 @@ deployment_groups: settings: partition_name: multns - ## Explicitly set node partition with one nodeset - - id: single_nodeset_explicit_partition - source: community/modules/compute/schedmd-slurm-gcp-v6-partition - settings: - partition_name: explns - is_default: true - nodeset: - - nodeset_name: expl - node_count_static: 0 - node_count_dynamic_max: 4 - enable_placement: false - node_conf: {} - additional_disks: [] - additional_networks: [] - bandwidth_tier: null - can_ip_forward: false - enable_smt: true - disk_auto_delete: true - disk_labels: {} - disk_size_gb: 50 - disk_type: pd-standard - enable_confidential_vm: false - enable_oslogin: true - enable_shielded_vm: false - enable_spot_vm: false - gpu: null - instance_template: null - labels: $(vars.labels) - machine_type: n2-standard-16 - maintenance_interval: "" - metadata: {} - min_cpu_platform: null - on_host_maintenance: TERMINATE - preemptible: false - reservation_name: null # will be replaced by default value empty string - service_account_email: null - shielded_instance_config: null - subnetwork_self_link: $(network.subnetwork_self_link) - spot_instance_config: null - source_image_family: null - source_image_project: null - source_image: null - tags: [] - access_config: [] - - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v6-login use: [network] @@ -165,7 +120,6 @@ deployment_groups: - network - single_nodeset_partition - multiple_nodesets - - single_nodeset_explicit_partition - homefs - slurm_login settings: From d76cfe2d716550aa3834a85229e0ba359acda4a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Thu, 17 Oct 2024 09:19:49 +0000 Subject: [PATCH 072/140] Use templates for DAOS mounts --- modules/file-system/parallelstore/README.md | 26 +++++++ modules/file-system/parallelstore/main.tf | 13 ++-- .../scripts/install-daos-client.sh | 1 + .../mount-daos.sh.tftpl} | 70 ++++++++----------- .../file-system/parallelstore/variables.tf | 14 ++++ .../pre-existing-network-storage/README.md | 33 +++++++++ .../pre-existing-network-storage/outputs.tf | 12 +++- .../scripts/install-daos-client.sh | 1 + .../mount-daos.sh.tftpl} | 70 ++++++++----------- .../pre-existing-network-storage/variables.tf | 10 +++ .../templates/startup-script-custom.tftpl | 2 +- tools/duplicate-diff.py | 4 +- 12 files changed, 168 insertions(+), 88 deletions(-) rename modules/file-system/parallelstore/{scripts/mount-daos.sh => templates/mount-daos.sh.tftpl} (55%) rename modules/file-system/pre-existing-network-storage/{scripts/mount-daos.sh => templates/mount-daos.sh.tftpl} (55%) diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md index 25c84eeaac..4a5896e217 100644 --- a/modules/file-system/parallelstore/README.md +++ 
b/modules/file-system/parallelstore/README.md @@ -94,6 +94,30 @@ Here you can replace `import_gcs_bucket_uri` with the uri of sub folder within G bucket and `import_destination_path` with local directory within parallelstore instance. +### Additional configuration for DAOS agent and dfuse +Use `daos_agent_config` to provide additional configuration for `daos_agent`, for example: + +```yaml +- id: parallelstorefs + source: modules/file-system/pre-existing-network-storage + settings: + daos_agent_config: | + credential_config: + cache_expiration: 1m +``` + +Use `dfuse_environment` to provide additional environment variables for `dfuse` process, for example: + +```yaml +- id: parallelstorefs + source: modules/file-system/parallelstore + settings: + dfuse_environment: + D_LOG_FILE: /tmp/client.log + D_APPEND_PID_TO_LOG: 1 + D_LOG_MASK: debug +``` + Copyright 2024 Google LLC @@ -142,7 +166,9 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [daos\_agent\_config](#input\_daos\_agent\_config) | Additional configuration to be added to daos\_config.yml | `string` | `""` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. | `string` | n/a | yes | +| [dfuse\_environment](#input\_dfuse\_environment) | Additional environment variables for DFuse process | `map(string)` | `{}` | no | | [directory\_stripe](#input\_directory\_stripe) | The parallelstore stripe level for directories. | `string` | `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"` | no | | [file\_stripe](#input\_file\_stripe) | The parallelstore stripe level for files. | `string` | `"FILE_STRIPE_LEVEL_UNSPECIFIED"` | no | | [import\_destination\_path](#input\_import\_destination\_path) | The name of local path to import data on parallelstore instance from GCS bucket. 
| `string` | `null` | no | diff --git a/modules/file-system/parallelstore/main.tf b/modules/file-system/parallelstore/main.tf index c09de17a2e..acc2a0551e 100644 --- a/modules/file-system/parallelstore/main.tf +++ b/modules/file-system/parallelstore/main.tf @@ -34,10 +34,15 @@ locals { } mount_runner = { - "type" = "shell" - "source" = "${path.module}/scripts/mount-daos.sh" - "args" = "--access_points=\"${local.access_points}\" --local_mount=\"${var.local_mount}\" --mount_options=\"${var.mount_options}\"" - "destination" = "mount_daos.sh" + "type" = "shell" + "content" = templatefile("${path.module}/templates/mount-daos.sh.tftpl", { + access_points = local.access_points + daos_agent_config = var.daos_agent_config + dfuse_environment = var.dfuse_environment + local_mount = var.local_mount + mount_options = join(" ", [for opt in split(",", var.mount_options) : "--${opt}"]) + }) + "destination" = "mount_filesystem${replace(var.local_mount, "/", "_")}.sh" } } diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index 22ec324af7..e96eadb56a 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -50,6 +50,7 @@ else if [ -x /usr/bin/google_disable_automatic_updates ]; then /usr/bin/google_disable_automatic_updates fi + dnf clean all dnf makecache # 2) Install daos-client diff --git a/modules/file-system/parallelstore/scripts/mount-daos.sh b/modules/file-system/parallelstore/templates/mount-daos.sh.tftpl similarity index 55% rename from modules/file-system/parallelstore/scripts/mount-daos.sh rename to modules/file-system/parallelstore/templates/mount-daos.sh.tftpl index a6a133b05d..c6f5d53660 100644 --- a/modules/file-system/parallelstore/scripts/mount-daos.sh +++ b/modules/file-system/parallelstore/templates/mount-daos.sh.tftpl @@ -20,59 +20,48 @@ OS_VERSION=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"/ OS_VERSION_MAJOR=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"//g' -e 's/\..*$//') if ! { - { [[ "${OS_ID}" = "rocky" ]] || [[ "${OS_ID}" = "rhel" ]]; } && { [[ "${OS_VERSION_MAJOR}" = "8" ]] || [[ "${OS_VERSION_MAJOR}" = "9" ]]; } || - { [[ "${OS_ID}" = "ubuntu" ]] && [[ "${OS_VERSION}" = "22.04" ]]; } || - { [[ "${OS_ID}" = "debian" ]] && [[ "${OS_VERSION_MAJOR}" = "12" ]]; } + { [[ "$${OS_ID}" = "rocky" ]] || [[ "$${OS_ID}" = "rhel" ]]; } && { [[ "$${OS_VERSION_MAJOR}" = "8" ]] || [[ "$${OS_VERSION_MAJOR}" = "9" ]]; } || + { [[ "$${OS_ID}" = "ubuntu" ]] && [[ "$${OS_VERSION}" = "22.04" ]]; } || + { [[ "$${OS_ID}" = "debian" ]] && [[ "$${OS_VERSION_MAJOR}" = "12" ]]; } }; then - echo "Unsupported operating system ${OS_ID} ${OS_VERSION}. This script only supports Rocky Linux 8, Redhat 8, Redhat 9, Ubuntu 22.04, and Debian 12." + echo "Unsupported operating system $${OS_ID} $${OS_VERSION}. This script only supports Rocky Linux 8, Redhat 8, Redhat 9, Ubuntu 22.04, and Debian 12." exit 1 fi -# Parse local_mount, mount_options from argument. -# Format mount-options string to be compatible to dfuse mount command. -# e.g. "disable-wb-cache,eq-count=8" --> --disable-wb-cache --eq-count=8. 
-for arg in "$@"; do - if [[ $arg == --access_points=* ]]; then - access_points="${arg#*=}" - fi - if [[ $arg == --local_mount=* ]]; then - local_mount="${arg#*=}" - fi - if [[ $arg == --mount_options=* ]]; then - mount_options="${arg#*=}" - mount_options="--${mount_options//,/ --}" - fi -done - # Edit agent config daos_config=/etc/daos/daos_agent.yml -sed -i "s/#.*transport_config/transport_config/g" $daos_config -sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config -sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config +# rewrite $daos_config from scratch +mv $${daos_config} $${daos_config}.orig + +exclude_fabric_ifaces="" # Get names of network interfaces not in first PCI slot # The first PCI slot is a standard network adapter while remaining interfaces # are typically network cards dedicated to GPU or workload communication -if [[ "$OS_ID" == "debian" ]] || [[ "${OS_ID}" = "ubuntu" ]]; then +if [[ "$${OS_ID}" == "debian" ]] || [[ "$${OS_ID}" = "ubuntu" ]]; then extra_interfaces=$(find /sys/class/net/ -not -name 'enp0s*' -regextype posix-extended -regex '.*/enp[0-9]+s.*' -printf '"%f"\n' | paste -s -d ',') -elif [[ "${OS_ID}" = "rocky" ]] || [[ "${OS_ID}" = "rhel" ]]; then +elif [[ "$${OS_ID}" = "rocky" ]] || [[ "$${OS_ID}" = "rhel" ]]; then extra_interfaces=$(find /sys/class/net/ -not -name eth0 -regextype posix-extended -regex '.*/eth[0-9]+' -printf '"%f"\n' | paste -s -d ',') fi -if [[ -n "$extra_interfaces" ]]; then - exclude_fabric_ifaces="\"lo\",$extra_interfaces" - sed -i "s/#.*exclude_fabric_ifaces: \[.*/exclude_fabric_ifaces: [$exclude_fabric_ifaces]/" $daos_config -fi +cat > $daos_config </etc/systemd/system/"${service_name}" </etc/systemd/system/"$${service_name}" < [fs\_type](#input\_fs\_type) | Type of file system to be mounted (e.g., nfs, lustre) | `string` | `"nfs"` | no | | [local\_mount](#input\_local\_mount) | The mount point where the contents of the device may be accessed after mounting. | `string` | `"/mnt"` | no | | [mount\_options](#input\_mount\_options) | Options describing various aspects of the file system. Consider adding setting to 'defaults,\_netdev,implicit\_dirs' when using gcsfuse. | `string` | `"defaults,_netdev"` | no | +| [parallelstore\_options](#input\_parallelstore\_options) | Parallelstore specific options |
object({
daos_agent_config = optional(string, "")
dfuse_environment = optional(map(string), {})
})
| `{}` | no | | [remote\_mount](#input\_remote\_mount) | Remote FS name or export. This is the exported directory for nfs, fs name for lustre, and bucket name (without gs://) for gcsfuse. | `string` | n/a | yes | | [server\_ip](#input\_server\_ip) | The device name as supplied to fs-tab, excluding remote fs-name(for nfs, that is the server IP, for lustre [:]). This can be omitted for gcsfuse. | `string` | `""` | no | diff --git a/modules/file-system/pre-existing-network-storage/outputs.tf b/modules/file-system/pre-existing-network-storage/outputs.tf index 9e93226804..df92f7c315 100644 --- a/modules/file-system/pre-existing-network-storage/outputs.tf +++ b/modules/file-system/pre-existing-network-storage/outputs.tf @@ -83,9 +83,15 @@ locals { } mount_runner_daos = { - "type" = "shell" - "content" = file("${path.module}/scripts/mount-daos.sh") - "args" = "--access_points=\"${var.remote_mount}\" --local_mount=\"${var.local_mount}\" --mount_options=\"${var.mount_options}\"" + "type" = "shell" + "content" = templatefile("${path.module}/templates/mount-daos.sh.tftpl", { + access_points = var.remote_mount + daos_agent_config = var.parallelstore_options.daos_agent_config + dfuse_environment = var.parallelstore_options.dfuse_environment + local_mount = var.local_mount + # avoid passing "--" as mount option to dfuse + mount_options = length(var.mount_options) == 0 ? "" : join(" ", [for opt in split(",", var.mount_options) : "--${opt}"]) + }) "destination" = "mount_filesystem${replace(var.local_mount, "/", "_")}.sh" } diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh index 22ec324af7..e96eadb56a 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh @@ -50,6 +50,7 @@ else if [ -x /usr/bin/google_disable_automatic_updates ]; then /usr/bin/google_disable_automatic_updates fi + dnf clean all dnf makecache # 2) Install daos-client diff --git a/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh b/modules/file-system/pre-existing-network-storage/templates/mount-daos.sh.tftpl similarity index 55% rename from modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh rename to modules/file-system/pre-existing-network-storage/templates/mount-daos.sh.tftpl index a6a133b05d..c6f5d53660 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh +++ b/modules/file-system/pre-existing-network-storage/templates/mount-daos.sh.tftpl @@ -20,59 +20,48 @@ OS_VERSION=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"/ OS_VERSION_MAJOR=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"//g' -e 's/\..*$//') if ! { - { [[ "${OS_ID}" = "rocky" ]] || [[ "${OS_ID}" = "rhel" ]]; } && { [[ "${OS_VERSION_MAJOR}" = "8" ]] || [[ "${OS_VERSION_MAJOR}" = "9" ]]; } || - { [[ "${OS_ID}" = "ubuntu" ]] && [[ "${OS_VERSION}" = "22.04" ]]; } || - { [[ "${OS_ID}" = "debian" ]] && [[ "${OS_VERSION_MAJOR}" = "12" ]]; } + { [[ "$${OS_ID}" = "rocky" ]] || [[ "$${OS_ID}" = "rhel" ]]; } && { [[ "$${OS_VERSION_MAJOR}" = "8" ]] || [[ "$${OS_VERSION_MAJOR}" = "9" ]]; } || + { [[ "$${OS_ID}" = "ubuntu" ]] && [[ "$${OS_VERSION}" = "22.04" ]]; } || + { [[ "$${OS_ID}" = "debian" ]] && [[ "$${OS_VERSION_MAJOR}" = "12" ]]; } }; then - echo "Unsupported operating system ${OS_ID} ${OS_VERSION}. 
This script only supports Rocky Linux 8, Redhat 8, Redhat 9, Ubuntu 22.04, and Debian 12." + echo "Unsupported operating system $${OS_ID} $${OS_VERSION}. This script only supports Rocky Linux 8, Redhat 8, Redhat 9, Ubuntu 22.04, and Debian 12." exit 1 fi -# Parse local_mount, mount_options from argument. -# Format mount-options string to be compatible to dfuse mount command. -# e.g. "disable-wb-cache,eq-count=8" --> --disable-wb-cache --eq-count=8. -for arg in "$@"; do - if [[ $arg == --access_points=* ]]; then - access_points="${arg#*=}" - fi - if [[ $arg == --local_mount=* ]]; then - local_mount="${arg#*=}" - fi - if [[ $arg == --mount_options=* ]]; then - mount_options="${arg#*=}" - mount_options="--${mount_options//,/ --}" - fi -done - # Edit agent config daos_config=/etc/daos/daos_agent.yml -sed -i "s/#.*transport_config/transport_config/g" $daos_config -sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config -sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config +# rewrite $daos_config from scratch +mv $${daos_config} $${daos_config}.orig + +exclude_fabric_ifaces="" # Get names of network interfaces not in first PCI slot # The first PCI slot is a standard network adapter while remaining interfaces # are typically network cards dedicated to GPU or workload communication -if [[ "$OS_ID" == "debian" ]] || [[ "${OS_ID}" = "ubuntu" ]]; then +if [[ "$${OS_ID}" == "debian" ]] || [[ "$${OS_ID}" = "ubuntu" ]]; then extra_interfaces=$(find /sys/class/net/ -not -name 'enp0s*' -regextype posix-extended -regex '.*/enp[0-9]+s.*' -printf '"%f"\n' | paste -s -d ',') -elif [[ "${OS_ID}" = "rocky" ]] || [[ "${OS_ID}" = "rhel" ]]; then +elif [[ "$${OS_ID}" = "rocky" ]] || [[ "$${OS_ID}" = "rhel" ]]; then extra_interfaces=$(find /sys/class/net/ -not -name eth0 -regextype posix-extended -regex '.*/eth[0-9]+' -printf '"%f"\n' | paste -s -d ',') fi -if [[ -n "$extra_interfaces" ]]; then - exclude_fabric_ifaces="\"lo\",$extra_interfaces" - sed -i "s/#.*exclude_fabric_ifaces: \[.*/exclude_fabric_ifaces: [$exclude_fabric_ifaces]/" $daos_config -fi +cat > $daos_config </etc/systemd/system/"${service_name}" </etc/systemd/system/"$${service_name}" < Date: Mon, 23 Dec 2024 21:48:13 +0000 Subject: [PATCH 073/140] add null checks to placement policy checks --- modules/compute/gke-node-pool/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 9a09712097..0d429165f3 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -308,7 +308,7 @@ resource "google_container_node_pool" "node_pool" { error_message = "At least one of max_unavailable or max_surge must greater than 0" } precondition { - condition = var.placement_policy.type != "COMPACT" || length(var.zones) == 1 + condition = var.placement_policy.type != "COMPACT" || (var.zones != null ? (length(var.zones) == 1) : false) error_message = "Compact placement is only available for node pools operating in a single zone." 
} precondition { From 3e333a7e2916338ec0addc7cc298a914eca0b842 Mon Sep 17 00:00:00 2001 From: ighosh98 Date: Tue, 24 Dec 2024 09:06:47 +0000 Subject: [PATCH 074/140] make upgrade settings configurable for gke-cluster --- modules/scheduler/gke-cluster/README.md | 1 + modules/scheduler/gke-cluster/main.tf | 29 ++++++++++++++++++++-- modules/scheduler/gke-cluster/variables.tf | 18 ++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 675039add6..a850202096 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -191,6 +191,7 @@ limitations under the License. | [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | +| [upgrade\_settings](#input\_upgrade\_settings) | Defines gke cluster upgrade settings. It is highly recommended that you define all max\_surge and max\_unavailable.
If max\_surge is not specified, it would be set to a default value of 0.
If max\_unavailable is not specified, it would be set to a default value of 1. |
object({
strategy = string
max_surge = optional(number)
max_unavailable = optional(number)
})
|
{
"max_surge": 0,
"max_unavailable": 1,
"strategy": "SURGE"
}
| no | | [zone](#input\_zone) | Zone for a zonal cluster. | `string` | `null` | no | ## Outputs diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 5b416a85bb..621189b19a 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -19,6 +19,14 @@ locals { labels = merge(var.labels, { ghpc_module = "gke-cluster", ghpc_role = "scheduler" }) } +locals { + upgrade_settings = { + strategy = var.upgrade_settings.strategy + max_surge = coalesce(var.upgrade_settings.max_surge, 0) + max_unavailable = coalesce(var.upgrade_settings.max_unavailable, 1) + } +} + locals { dash = var.prefix_with_deployment_name && var.name_suffix != "" ? "-" : "" prefix = var.prefix_with_deployment_name ? var.deployment_name : "" @@ -243,8 +251,9 @@ resource "google_container_node_pool" "system_node_pools" { } upgrade_settings { - max_surge = 1 - max_unavailable = 0 + strategy = local.upgrade_settings.strategy + max_surge = local.upgrade_settings.max_surge + max_unavailable = local.upgrade_settings.max_unavailable } management { @@ -304,6 +313,22 @@ resource "google_container_node_pool" "system_node_pools" { node_config[0].labels, node_config[0].taint, ] + precondition { + condition = contains(["SURGE"], local.upgrade_settings.strategy) + error_message = "Only SURGE strategy is supported" + } + precondition { + condition = local.upgrade_settings.max_unavailable >= 0 + error_message = "max_unavailable should be set to 0 or greater" + } + precondition { + condition = local.upgrade_settings.max_surge >= 0 + error_message = "max_surge should be set to 0 or greater" + } + precondition { + condition = local.upgrade_settings.max_unavailable > 0 || local.upgrade_settings.max_surge > 0 + error_message = "At least one of max_unavailable or max_surge must greater than 0" + } } } diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index 58bf197763..9b807e7826 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -407,3 +407,21 @@ variable "deletion_protection" { type = bool default = false } + +variable "upgrade_settings" { + description = <<-EOT + Defines gke cluster upgrade settings. It is highly recommended that you define all max_surge and max_unavailable. + If max_surge is not specified, it would be set to a default value of 0. + If max_unavailable is not specified, it would be set to a default value of 1. 
+ EOT + type = object({ + strategy = string + max_surge = optional(number) + max_unavailable = optional(number) + }) + default = { + strategy = "SURGE" + max_surge = 0 + max_unavailable = 1 + } +} From 4969446bf028a1d92e7be5e99c1a0dcb83e71780 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Thu, 26 Dec 2024 19:23:00 +0000 Subject: [PATCH 075/140] update a3mega nccl plugin to 1.0.7 and rxdm to 1.0.13_1 --- modules/compute/gke-node-pool/gpu_direct.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 9403ea34fc..8f0e67a9b2 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -43,11 +43,11 @@ locals { "a3-megagpu-8g" = { # Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines gpu_direct_manifests = [ - "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpxo/nccl-tcpxo-installer.yaml", # nccl_plugin v1.0.4 for tcpxo - "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin + "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/b324ec8994aa98ca320438dd2d01ff6d7f9165bb/gpudirect-tcpxo/nccl-tcpxo-installer.yaml", # nccl_plugin v1.0.7 for tcpxo + "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/b324ec8994aa98ca320438dd2d01ff6d7f9165bb/nri_device_injector/nri-device-injector.yaml", # nri_plugin ] updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") - rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 + rxdm_version = "v1.0.13_1" # matching nccl-tcpxo-installer version v1.0.7 min_additional_networks = 8 major_minor_version_acceptable_map = { "1.28" = "1.28.9-gke.1250000" From 80ad9b13bc6128a7c291ffb1e58657e05af3987c Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Mon, 30 Dec 2024 21:35:07 +0000 Subject: [PATCH 076/140] add GKE support for managed hyperdisk --- examples/README.md | 24 ++ examples/gke-managed-hyperdisk.yaml | 218 ++++++++++++++++++ modules/file-system/gke-storage/README.md | 2 +- .../hyperdisk-balanced-pvc.yaml.tftpl | 15 ++ .../hyperdisk-extreme-pvc.yaml.tftpl | 15 ++ .../hyperdisk-throughput-pvc.yaml.tftpl | 15 ++ .../hyperdisk-balanced-sc.yaml.tftpl | 25 ++ .../hyperdisk-extreme-sc.yaml.tftpl | 24 ++ .../hyperdisk-throughput-sc.yaml.tftpl | 24 ++ modules/file-system/gke-storage/variables.tf | 7 +- .../test-gke-managed-hyperdisk.yml | 41 ++++ .../builds/gke-managed-hyperdisk.yaml | 55 +++++ .../tests/gke-managed-hyperdisk.yml | 29 +++ 13 files changed, 491 insertions(+), 3 deletions(-) create mode 100644 examples/gke-managed-hyperdisk.yaml create mode 100644 modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-balanced-pvc.yaml.tftpl create mode 100644 modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-extreme-pvc.yaml.tftpl create mode 100644 modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-throughput-pvc.yaml.tftpl create mode 100644 modules/file-system/gke-storage/storage-class/hyperdisk-balanced-sc.yaml.tftpl create mode 100644 modules/file-system/gke-storage/storage-class/hyperdisk-extreme-sc.yaml.tftpl create mode 100644 modules/file-system/gke-storage/storage-class/hyperdisk-throughput-sc.yaml.tftpl create mode 
100644 tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-hyperdisk.yml create mode 100644 tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml create mode 100644 tools/cloud-build/daily-tests/tests/gke-managed-hyperdisk.yml diff --git a/examples/README.md b/examples/README.md index 29db27df94..a1d3d0c589 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1518,6 +1518,30 @@ cleaned up when the job is deleted. [storage-gke.yaml]: ../examples/storage-gke.yaml +### [gke-managed-hyperdisk.yaml] ![core-badge] ![experimental-badge] + +This blueprint shows how to use managed hyperdisk storage options with GKE in the toolkit. + +The blueprint contains the following: + +* A K8s Job that uses a managed hyperdisk storage volume option. +* A K8s Job that demonstrates ML training workload with managed hyperdisk storage disk operation. + +> **Warning**: In this example blueprint, when storage type `Hyperdisk-balanced`, `Hyperdisk-extreme` or `Hyperdisk-throughput` is specified in `gke-storage` module. +> The lifecycle of the hyperdisk is managed by the blueprint. +> On glcuster destroy operation, the hyperdisk storage created will also be destroyed. +> +> [!Note] +> The Kubernetes API server will only allow requests from authorized networks. +> The `gke-cluster` module needs access to the Kubernetes API server +> to create a Persistent Volume and a Persistent Volume Claim. **You must use +> the `authorized_cidr` variable to supply an authorized network which contains +> the IP address of the machine deploying the blueprint, for example +> `--vars authorized_cidr=/32`.** You can use a service like +> [whatismyip.com](https://whatismyip.com) to determine your IP address. + +[gke-managed-hyperdisk.yaml]: ../examples/gke-managed-hyperdisk.yaml + ### [gke-managed-parallelstore.yaml] ![core-badge] ![experimental-badge] This blueprint shows how to use managed parallelstore storage options with GKE in the toolkit. diff --git a/examples/gke-managed-hyperdisk.yaml b/examples/gke-managed-hyperdisk.yaml new file mode 100644 index 0000000000..12c8063026 --- /dev/null +++ b/examples/gke-managed-hyperdisk.yaml @@ -0,0 +1,218 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +blueprint_name: gke-storage-hyperdisk +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gke-storage-hyperdisk + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. 
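  # For example, if the public IP of the machine running Terraform is 203.0.113.7
  # (one way to check is `curl -s ifconfig.me`), the next line would read
  # `authorized_cidr: 203.0.113.7/32`. The address 203.0.113.7 is only an
  # illustrative placeholder; substitute your own address.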
+ authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet-hyperdisk + secondary_ranges: + gke-subnet-hyperdisk: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network] + settings: + enable_persistent_disk_csi: true # enable Hyperdisk for the cluster + configure_workload_identity_sa: true + enable_private_endpoint: false # Allows for access from authorized public IPs + master_authorized_networks: + - display_name: deployment-machine + cidr_block: $(vars.authorized_cidr) + outputs: [instructions] + + ### Set up storage class and persistent volume claim for Hyperdisk ### + - id: hyperdisk-balanced-setup + source: modules/file-system/gke-storage + use: [gke_cluster] + settings: + storage_type: Hyperdisk-balanced + access_mode: ReadWriteOnce + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete + sc_topology_zones: [$(vars.zone)] + pvc_count: 1 + capacity_gb: 100 + + - id: hyperdisk-throughput-setup + source: modules/file-system/gke-storage + use: [gke_cluster] + settings: + storage_type: Hyperdisk-throughput + access_mode: ReadWriteOnce + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete + sc_topology_zones: [$(vars.zone)] + pvc_count: 1 + capacity_gb: 5000 + + - id: hyperdisk-extreme-setup + source: modules/file-system/gke-storage + use: [gke_cluster] + settings: + storage_type: Hyperdisk-extreme + access_mode: ReadWriteOnce + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete + sc_topology_zones: [$(vars.zone)] + pvc_count: 1 + capacity_gb: 100 + + - id: sample-pool + source: modules/compute/gke-node-pool + use: [gke_cluster] + settings: + name: sample-pool + zones: [$(vars.zone)] + machine_type: c3-standard-88 # Hyperdisk-extreme required C3 machine with 88 or more vCPUs + + # Train a TensorFlow model with Keras and Hyperdisk Balanced on GKE + # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample + - id: hyperdisk-balanced-job + source: modules/compute/gke-job-template + use: + - gke_cluster + - hyperdisk-balanced-setup + settings: + name: tensorflow + image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d + security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk + - key: runAsUser + value: 1000 + - key: runAsGroup + value: 100 + - key: fsGroup + value: 100 + command: + - bash + - -c + - | + pip install transformers datasets + python - < [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported value:
- Retain
- Delete | `string` | n/a | yes | | [sc\_topology\_zones](#input\_sc\_topology\_zones) | Zone location that allow the volumes to be dynamically provisioned. | `list(string)` | `null` | no | | [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.
Supported value:
- Immediate
- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no | -| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to used. This module currently support dynamic provisioning for the below storage options
- Parallelstore | `string` | n/a | yes | +| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to used. This module currently support dynamic provisioning for the below storage options
- Parallelstore
- Hyperdisk-balanced
- Hyperdisk-throughput
- Hyperdisk-extreme | `string` | n/a | yes | ## Outputs diff --git a/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-balanced-pvc.yaml.tftpl b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-balanced-pvc.yaml.tftpl new file mode 100644 index 0000000000..32781be2fb --- /dev/null +++ b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-balanced-pvc.yaml.tftpl @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ${pvc_name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +spec: + accessModes: + - ${access_mode} + resources: + requests: + storage: ${capacity} + storageClassName: ${storage_class_name} diff --git a/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-extreme-pvc.yaml.tftpl b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-extreme-pvc.yaml.tftpl new file mode 100644 index 0000000000..32781be2fb --- /dev/null +++ b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-extreme-pvc.yaml.tftpl @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ${pvc_name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +spec: + accessModes: + - ${access_mode} + resources: + requests: + storage: ${capacity} + storageClassName: ${storage_class_name} diff --git a/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-throughput-pvc.yaml.tftpl b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-throughput-pvc.yaml.tftpl new file mode 100644 index 0000000000..32781be2fb --- /dev/null +++ b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-throughput-pvc.yaml.tftpl @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ${pvc_name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +spec: + accessModes: + - ${access_mode} + resources: + requests: + storage: ${capacity} + storageClassName: ${storage_class_name} diff --git a/modules/file-system/gke-storage/storage-class/hyperdisk-balanced-sc.yaml.tftpl b/modules/file-system/gke-storage/storage-class/hyperdisk-balanced-sc.yaml.tftpl new file mode 100644 index 0000000000..46e1f023d3 --- /dev/null +++ b/modules/file-system/gke-storage/storage-class/hyperdisk-balanced-sc.yaml.tftpl @@ -0,0 +1,25 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ${name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +provisioner: pd.csi.storage.gke.io +allowVolumeExpansion: true +parameters: + type: hyperdisk-balanced + provisioned-throughput-on-create: "250Mi" + provisioned-iops-on-create: "7000" +volumeBindingMode: ${volume_binding_mode} +reclaimPolicy: ${reclaim_policy} + %{~ if topology_zones != null ~} +allowedTopologies: +- matchLabelExpressions: + - key: topology.gke.io/zone + values: + %{~ for z in topology_zones ~} + - ${z} + %{~ endfor ~} + %{~ endif ~} diff --git a/modules/file-system/gke-storage/storage-class/hyperdisk-extreme-sc.yaml.tftpl b/modules/file-system/gke-storage/storage-class/hyperdisk-extreme-sc.yaml.tftpl new file mode 100644 index 0000000000..445020d001 --- /dev/null +++ b/modules/file-system/gke-storage/storage-class/hyperdisk-extreme-sc.yaml.tftpl @@ -0,0 +1,24 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ${name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} +provisioner: pd.csi.storage.gke.io +allowVolumeExpansion: true +parameters: + %{~ endfor ~} + type: 
hyperdisk-extreme + provisioned-iops-on-create: "50000" +volumeBindingMode: ${volume_binding_mode} +reclaimPolicy: ${reclaim_policy} + %{~ if topology_zones != null ~} +allowedTopologies: +- matchLabelExpressions: + - key: topology.gke.io/zone + values: + %{~ for z in topology_zones ~} + - ${z} + %{~ endfor ~} + %{~ endif ~} diff --git a/modules/file-system/gke-storage/storage-class/hyperdisk-throughput-sc.yaml.tftpl b/modules/file-system/gke-storage/storage-class/hyperdisk-throughput-sc.yaml.tftpl new file mode 100644 index 0000000000..ec404aec45 --- /dev/null +++ b/modules/file-system/gke-storage/storage-class/hyperdisk-throughput-sc.yaml.tftpl @@ -0,0 +1,24 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ${name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +provisioner: pd.csi.storage.gke.io +allowVolumeExpansion: true +parameters: + type: hyperdisk-throughput + provisioned-throughput-on-create: "250Mi" +volumeBindingMode: ${volume_binding_mode} +reclaimPolicy: ${reclaim_policy} + %{~ if topology_zones != null ~} +allowedTopologies: +- matchLabelExpressions: + - key: topology.gke.io/zone + values: + %{~ for z in topology_zones ~} + - ${z} + %{~ endfor ~} + %{~ endif ~} diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf index 9efbe6082c..b33203be0f 100644 --- a/modules/file-system/gke-storage/variables.tf +++ b/modules/file-system/gke-storage/variables.tf @@ -30,12 +30,15 @@ variable "storage_type" { The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview) to used. This module currently support dynamic provisioning for the below storage options - Parallelstore + - Hyperdisk-balanced + - Hyperdisk-throughput + - Hyperdisk-extreme EOT type = string nullable = false validation { - condition = var.storage_type == null ? false : contains(["parallelstore"], lower(var.storage_type)) - error_message = "Allowed string values for var.storage_type are \"Parallelstore\"." + condition = var.storage_type == null ? false : contains(["parallelstore", "hyperdisk-balanced", "hyperdisk-throughput", "hyperdisk-extreme"], lower(var.storage_type)) + error_message = "Allowed string values for var.storage_type are \"Parallelstore\", \"Hyperdisk-balanced\", \"Hyperdisk-throughput\", \"Hyperdisk-extreme\"." } } diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-hyperdisk.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-hyperdisk.yml new file mode 100644 index 0000000000..fb114c402a --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-hyperdisk.yml @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +- name: Get cluster credentials for kubectl + delegate_to: localhost + ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} + +- name: Execute the job + delegate_to: localhost + ansible.builtin.shell: | + jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*) + for job in "${jobs[@]}"; do + kubectl create -f "$job" + done + args: + executable: /bin/bash + changed_when: False + +- name: Wait for job to complete + delegate_to: localhost + ansible.builtin.command: | + kubectl get job --field-selector status.successful=1 + register: job_completion + until: job_completion.stdout_lines | length > 3 # 3 jobs total + retries: 80 + delay: 15 + +- name: Print job_completion debug output + ansible.builtin.debug: + var: job_completion.stdout_lines diff --git a/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml b/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml new file mode 100644 index 0000000000..64129fcdde --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml @@ -0,0 +1,55 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.gke-cluster +- m.gke-job-template +- m.gke-node-pool +- m.gke-storage +- m.vpc +- gke + +timeout: 14400s # 4hr + +steps: +## Test GKE +- id: gke-managed-hyperdisk + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + SG_EXAMPLE=examples/gke-managed-hyperdisk.yaml + # adding vm to act as remote node + echo ' - id: remote-node' >> $${SG_EXAMPLE} + echo ' source: modules/compute/vm-instance' >> $${SG_EXAMPLE} + echo ' use: [network]' >> $${SG_EXAMPLE} + echo ' settings:' >> $${SG_EXAMPLE} + echo ' machine_type: e2-standard-2' >> $${SG_EXAMPLE} + echo ' zone: us-central1-a' >> $${SG_EXAMPLE} + # avoids conflict with other tests + sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} + IP=$(curl ifconfig.me) + sed -i "s//$${IP}/" $${SG_EXAMPLE} + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-managed-hyperdisk.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-managed-hyperdisk.yml b/tools/cloud-build/daily-tests/tests/gke-managed-hyperdisk.yml new file mode 100644 index 0000000000..036657720a --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/gke-managed-hyperdisk.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +test_name: gke-managed-hyperdisk +deployment_name: gke-managed-hyperdisk-{{ build }} +zone: us-central1-a # for remote node +region: us-central1 +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/gke-managed-hyperdisk.yaml" +network: "{{ deployment_name }}-net" +remote_node: "{{ deployment_name }}-0" +post_deploy_tests: +- test-validation/test-gke-managed-hyperdisk.yml +custom_vars: + project: "{{ project }}" +cli_deployment_vars: + region: "{{ region }}" + gcp_public_cidrs_access_enabled: true From b874f82a73c53bb3c752aac8a213a08bba07598f Mon Sep 17 00:00:00 2001 From: Indraneel Ghosh Date: Tue, 31 Dec 2024 14:09:16 +0530 Subject: [PATCH 077/140] Update Stackdrier typo in README.md --- modules/scripts/startup-script/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index 67f0effe7b..48d79f872b 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -143,7 +143,7 @@ recommends using the _Cloud Ops Agent_, it is recommended to use #### Stackdriver Agent Installation If an image or machine already has Cloud Ops Agent installed and you would like -to instead use the Stackdrier Agent, the following script will remove the Cloud +to instead use the Stackdriver Agent, the following script will remove the Cloud Ops Agent and install the Stackdriver Agent. ```bash From 0fbbc7c1742625c65d5473dc30d226cdadbf8ae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Tue, 31 Dec 2024 10:07:43 +0000 Subject: [PATCH 078/140] Fix failures if not specifing stipe sizes for parallelstore --- modules/file-system/parallelstore/README.md | 4 ++-- modules/file-system/parallelstore/variables.tf | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md index 4a5896e217..9b0595c965 100644 --- a/modules/file-system/parallelstore/README.md +++ b/modules/file-system/parallelstore/README.md @@ -169,8 +169,8 @@ No modules. | [daos\_agent\_config](#input\_daos\_agent\_config) | Additional configuration to be added to daos\_config.yml | `string` | `""` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. | `string` | n/a | yes | | [dfuse\_environment](#input\_dfuse\_environment) | Additional environment variables for DFuse process | `map(string)` | `{}` | no | -| [directory\_stripe](#input\_directory\_stripe) | The parallelstore stripe level for directories. | `string` | `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"` | no | -| [file\_stripe](#input\_file\_stripe) | The parallelstore stripe level for files. | `string` | `"FILE_STRIPE_LEVEL_UNSPECIFIED"` | no | +| [directory\_stripe](#input\_directory\_stripe) | The parallelstore stripe level for directories. | `string` | `null` | no | +| [file\_stripe](#input\_file\_stripe) | The parallelstore stripe level for files. 
| `string` | `null` | no | | [import\_destination\_path](#input\_import\_destination\_path) | The name of local path to import data on parallelstore instance from GCS bucket. | `string` | `null` | no | | [import\_gcs\_bucket\_uri](#input\_import\_gcs\_bucket\_uri) | The name of the GCS bucket to import data from to parallelstore. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to parallel store instance. | `map(string)` | `{}` | no | diff --git a/modules/file-system/parallelstore/variables.tf b/modules/file-system/parallelstore/variables.tf index 836f443f19..d5b7e7a19a 100644 --- a/modules/file-system/parallelstore/variables.tf +++ b/modules/file-system/parallelstore/variables.tf @@ -109,9 +109,9 @@ variable "import_destination_path" { variable "file_stripe" { description = "The parallelstore stripe level for files." type = string - default = "FILE_STRIPE_LEVEL_UNSPECIFIED" + default = null validation { - condition = contains([ + condition = var.file_stripe == null ? true : contains([ "FILE_STRIPE_LEVEL_UNSPECIFIED", "FILE_STRIPE_LEVEL_MIN", "FILE_STRIPE_LEVEL_BALANCED", @@ -124,9 +124,9 @@ variable "file_stripe" { variable "directory_stripe" { description = "The parallelstore stripe level for directories." type = string - default = "DIRECTORY_STRIPE_LEVEL_UNSPECIFIED" + default = null validation { - condition = contains([ + condition = var.directory_stripe == null ? true : contains([ "DIRECTORY_STRIPE_LEVEL_UNSPECIFIED", "DIRECTORY_STRIPE_LEVEL_MIN", "DIRECTORY_STRIPE_LEVEL_BALANCED", From 7605aaa5b7d999f65865b34e4166a8c1bd7c6d5e Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Fri, 3 Jan 2025 00:20:13 +0000 Subject: [PATCH 079/140] address comment --- examples/gke-managed-hyperdisk.yaml | 11 +++++++++-- examples/gke-managed-parallelstore.yaml | 7 +++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/gke-managed-hyperdisk.yaml b/examples/gke-managed-hyperdisk.yaml index 12c8063026..4be8bcf83c 100644 --- a/examples/gke-managed-hyperdisk.yaml +++ b/examples/gke-managed-hyperdisk.yaml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -blueprint_name: gke-storage-hyperdisk +blueprint_name: gke-managed-hyperdisk vars: project_id: ## Set GCP Project ID Here ## - deployment_name: gke-storage-hyperdisk + deployment_name: gke-managed-hyperdisk region: us-central1 zone: us-central1-c @@ -41,12 +41,18 @@ deployment_groups: source: modules/scheduler/gke-cluster use: [network] settings: + release_channel: RAPID enable_persistent_disk_csi: true # enable Hyperdisk for the cluster configure_workload_identity_sa: true enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: - display_name: deployment-machine cidr_block: $(vars.authorized_cidr) + maintenance_exclusions: + - name: no-minor-or-node-upgrades-indefinite + start_time: "2024-12-01T00:00:00Z" + end_time: "2025-12-22T00:00:00Z" + exclusion_scope: NO_MINOR_OR_NODE_UPGRADES outputs: [instructions] ### Set up storage class and persistent volume claim for Hyperdisk ### @@ -93,6 +99,7 @@ deployment_groups: name: sample-pool zones: [$(vars.zone)] machine_type: c3-standard-88 # Hyperdisk-extreme required C3 machine with 88 or more vCPUs + auto_upgrade: true # Train a TensorFlow model with Keras and Hyperdisk Balanced on GKE # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample diff --git a/examples/gke-managed-parallelstore.yaml b/examples/gke-managed-parallelstore.yaml index 4425f13181..6f292e0bb6 100644 --- a/examples/gke-managed-parallelstore.yaml +++ b/examples/gke-managed-parallelstore.yaml @@ -63,6 +63,7 @@ deployment_groups: source: modules/scheduler/gke-cluster use: [network] settings: + release_channel: RAPID enable_parallelstore_csi: true # enable Parallelstore for the cluster configure_workload_identity_sa: true enable_private_endpoint: false # Allows for access from authorized public IPs @@ -70,6 +71,11 @@ deployment_groups: master_authorized_networks: - display_name: deployment-machine cidr_block: $(vars.authorized_cidr) + maintenance_exclusions: + - name: no-minor-or-node-upgrades-indefinite + start_time: "2024-12-01T00:00:00Z" + end_time: "2025-12-22T00:00:00Z" + exclusion_scope: NO_MINOR_OR_NODE_UPGRADES outputs: [instructions] ### Set up storage class and persistent volume claim for Parallelstore ### @@ -92,6 +98,7 @@ deployment_groups: name: sample-pool zones: [$(vars.zone)] machine_type: n2-standard-16 + auto_upgrade: true # Train a TensorFlow model with Keras and Parallelstore on GKE # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample From b390c5ffaeecd8ec0ac74a036423ed32a448c54c Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Fri, 3 Jan 2025 20:42:45 +0000 Subject: [PATCH 080/140] update README for managed-hyperdiska nd managed-parallelstore example blueprint --- examples/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/README.md b/examples/README.md index a1d3d0c589..d268dcd423 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1526,6 +1526,8 @@ The blueprint contains the following: * A K8s Job that uses a managed hyperdisk storage volume option. * A K8s Job that demonstrates ML training workload with managed hyperdisk storage disk operation. 
+ * The sample training workload manifest will be generated under the gke-managed-hyperdisk/primary folder, as tensorflow-GUID.yaml + * You can deploy this sample training workload using "kubectl apply -f tensorflow-GUID.yaml" to start the training > **Warning**: In this example blueprint, when storage type `Hyperdisk-balanced`, `Hyperdisk-extreme` or `Hyperdisk-throughput` is specified in `gke-storage` module. > The lifecycle of the hyperdisk is managed by the blueprint. @@ -1550,6 +1552,8 @@ The blueprint contains the following: * A K8s Job that uses a managed parallelstore storage volume option. * A K8s Job that demonstrates ML training workload with managed parallelstore storage disk operation. + * The sample training workload manifest will be generated under the gke-managed-parallelstore/primary folder, as tensorflow-GUID.yaml + * You can deploy this sample training workload using "kubectl apply -f tensorflow-GUID.yaml" to start the training > **Warning**: In this example blueprint, when storage type `Parallelstore` is specified in `gke-storage` module. > The lifecycle of the parallelstore is managed by the blueprint. From 526b171324dd6dfa21d15516b1dc22ff8c4bd5c5 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 16 Dec 2024 19:45:27 +0000 Subject: [PATCH 081/140] Document unsupported "bracket-less" collection addressing --- examples/README.md | 4 ++++ pkg/config/expression.go | 8 ++++++++ pkg/config/expression_test.go | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/examples/README.md b/examples/README.md index 30883ce0f9..82539448ca 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1884,6 +1884,10 @@ To learn more about how to refer to a module in a blueprint file, please consult Variables can be used to refer both to values defined elsewhere in the blueprint and to the output and structure of other modules. +> [!NOTE] +> "Brackets-less" access to elements of collection is not supported, use brackets. +> E.g. `pink.lime[0].salmon` instead of `pink.lime.0.salmon`. + ### Blueprint expressions Expressions in a blueprint file can refer to deployment variables or the outputs diff --git a/pkg/config/expression.go b/pkg/config/expression.go index 3cfeb096d1..0fb75f71fd 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -88,6 +88,14 @@ func bpTraversalToTerraform(t hcl.Traversal) (hcl.Traversal, error) { // BlueprintExpressionLiteralToExpression takes a content of `$(...)`-literal and transforms it to `Expression` func BlueprintExpressionLiteralToExpression(s string) (Expression, error) { + // TODO: FIX: this function relies on assumption that + // `epxrToTokens(toExpression(tokenize(X))) == tokenize(X)` + // This is not correct, e.g.: + // ``` + // epxrToTokens(toExpression(tokenize("pink.lime.0.salmon"))) == + // tokenize("pink.lime[0].salmon") != tokenize("pink.lime.0.salmon") + // ``` + // As a result `pink.lime.0.salmon` can not be properly translated. 
bpExp, diag := hclsyntax.ParseExpression([]byte(s), "", hcl.Pos{}) if diag.HasErrors() { return nil, diag diff --git a/pkg/config/expression_test.go b/pkg/config/expression_test.go index 88a8fa8338..e7a135846e 100644 --- a/pkg/config/expression_test.go +++ b/pkg/config/expression_test.go @@ -85,6 +85,7 @@ func TestParseBpLit(t *testing.T) { {"$(vars.green.sleeve)", "var.green.sleeve", false}, {`$(vars.green["sleeve"])`, `var.green["sleeve"]`, false}, {"$(vars.green.sleeve[3])", "var.green.sleeve[3]", false}, + {"$(vars.green[3].sleeve)", "var.green[3].sleeve", false}, {"$(var.green)", "module.var.green", false}, {"$(box.green)", "module.box.green", false}, @@ -135,6 +136,9 @@ echo "Hello $(vars.project_id)" {"$(vars[3]])", "", true}, // can't index vars {`$(vars["green"])`, "", true}, // can't index module + // TODO: uncomment + // see comment to `BlueprintExpressionLiteralToExpression` + // {"$(pink.lime.0.salmon)", "module.pink.lime[0].salmon", false}, } for _, tc := range tests { t.Run(tc.input, func(t *testing.T) { From b76bcc22d0271e5326c6860111c50071d2f88a3c Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Sat, 4 Jan 2025 01:22:47 +0000 Subject: [PATCH 082/140] Remove provisioningModel from future reservations --- .../modules/slurm_files/scripts/resume.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index fa5413e53c..7bec9be1a1 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -103,10 +103,10 @@ def instance_properties(nodeset:object, model:str, placement_group:Optional[str] props.resourcePolicies = [placement_group] if reservation := lookup().nodeset_reservation(nodeset): - update_reservation_props(reservation, props, placement_group, False) + update_reservation_props(reservation, props, placement_group) if (fr := lookup().future_reservation(nodeset)) and fr.specific: - update_reservation_props(fr.active_reservation, props, placement_group, True) + update_reservation_props(fr.active_reservation, props, placement_group) if props.resourcePolicies: props.scheduling.onHostMaintenance = "TERMINATE" @@ -121,14 +121,14 @@ def instance_properties(nodeset:object, model:str, placement_group:Optional[str] props.update(nodeset.get("instance_properties") or {}) return props -def update_reservation_props(reservation:object, props:object, placement_group:Optional[str], reservation_from_fr:bool) -> None: +def update_reservation_props(reservation:object, props:object, placement_group:Optional[str]) -> None: props.reservationAffinity = { "consumeReservationType": "SPECIFIC_RESERVATION", "key": f"compute.{util.universe_domain()}/reservation-name", "values": [reservation.bulk_insert_name], } - if reservation.dense or reservation_from_fr: + if reservation.dense: props.scheduling.provisioningModel = "RESERVATION_BOUND" # Figure out `resourcePolicies` From 203da782fa54a041bf051bf749cd01d860f483aa Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 6 Jan 2025 14:24:31 +0000 Subject: [PATCH 083/140] Remove slurm-gcp v5 tests --- .../daily-tests/builds/hcls-v5-legacy.yaml | 69 -------------- .../hpc-enterprise-slurm-v5-legacy.yaml | 46 --------- .../hpc-slurm-chromedesktop-v5-legacy.yaml | 45 --------- 
.../builds/lustre-slurm-v5-legacy.yaml | 43 --------- .../builds/ml-a3-highgpu-slurm-v5.yaml | 93 ------------------- .../builds/ml-slurm-v5-legacy.yaml | 48 ---------- .../daily-tests/builds/packer-v5-legacy.yaml | 46 --------- .../builds/slurm-gcp-v5-debian.yaml | 43 --------- .../builds/slurm-gcp-v5-hpc-centos7.yaml | 42 --------- .../builds/slurm-gcp-v5-rocky8.yaml | 43 --------- ...lurm-gcp-v5-startup-scripts-v5-legacy.yaml | 45 --------- .../builds/slurm-gcp-v5-ubuntu2004.yaml | 43 --------- .../daily-tests/tests/hcls-v5-legacy.yml | 45 --------- .../tests/hpc-enterprise-slurm-v5-legacy.yml | 50 ---------- .../tests/hpc-slurm-chromedesktop.yml | 42 --------- .../tests/lustre-slurm-v5-legacy.yml | 43 --------- .../ml-a3-highgpu-slurm-cluster-legacy.yml | 49 ---------- .../daily-tests/tests/ml-slurm-v5-legacy.yml | 23 ----- .../daily-tests/tests/packer-v5-legacy.yml | 27 ------ .../daily-tests/tests/slurm-v5-debian.yml | 45 --------- .../tests/slurm-v5-hpc-centos7.yml | 44 --------- .../daily-tests/tests/slurm-v5-rocky8.yml | 45 --------- .../tests/slurm-v5-startup-scripts.yml | 38 -------- .../daily-tests/tests/slurm-v5-ubuntu.yml | 40 -------- 24 files changed, 1097 deletions(-) delete mode 100644 tools/cloud-build/daily-tests/builds/hcls-v5-legacy.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm-v5-legacy.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/hpc-slurm-chromedesktop-v5-legacy.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/lustre-slurm-v5-legacy.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm-v5.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/packer-v5-legacy.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/slurm-gcp-v5-debian.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/slurm-gcp-v5-rocky8.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/slurm-gcp-v5-startup-scripts-v5-legacy.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/slurm-gcp-v5-ubuntu2004.yaml delete mode 100644 tools/cloud-build/daily-tests/tests/hcls-v5-legacy.yml delete mode 100644 tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm-v5-legacy.yml delete mode 100644 tools/cloud-build/daily-tests/tests/hpc-slurm-chromedesktop.yml delete mode 100644 tools/cloud-build/daily-tests/tests/lustre-slurm-v5-legacy.yml delete mode 100644 tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster-legacy.yml delete mode 100644 tools/cloud-build/daily-tests/tests/ml-slurm-v5-legacy.yml delete mode 100644 tools/cloud-build/daily-tests/tests/packer-v5-legacy.yml delete mode 100644 tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml delete mode 100644 tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml delete mode 100644 tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml delete mode 100644 tools/cloud-build/daily-tests/tests/slurm-v5-startup-scripts.yml delete mode 100644 tools/cloud-build/daily-tests/tests/slurm-v5-ubuntu.yml diff --git a/tools/cloud-build/daily-tests/builds/hcls-v5-legacy.yaml b/tools/cloud-build/daily-tests/builds/hcls-v5-legacy.yaml deleted file mode 100644 index 0f39d815ee..0000000000 --- a/tools/cloud-build/daily-tests/builds/hcls-v5-legacy.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 
(the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.chrome-remote-desktop -- m.cloud-storage-bucket -- m.dashboard -- m.filestore -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- m.service-enablement -- m.spack-execute -- m.spack-setup -- m.startup-script -- m.vm-instance -- m.vpc -- spack -- crd -- slurm5 - -timeout: 14400s # 4hr -steps: -# While using static network names we are gaurding against more than 1 instance running at a time (for multi-group tests) -- id: check_for_running_build - name: gcr.io/cloud-builders/gcloud - entrypoint: /bin/bash - args: - - -c - - | - set -x -e - echo $TRIGGER_BUILD_CONFIG_PATH - MATCHING_BUILDS=$(gcloud builds list --ongoing --format 'value(id)' --filter='substitutions.TRIGGER_BUILD_CONFIG_PATH="$TRIGGER_BUILD_CONFIG_PATH"') - MATCHING_COUNT=$(echo $$MATCHING_BUILDS | wc -w) - if [ "$$MATCHING_COUNT" -gt 1 ]; then - echo "Found more than 1 matching running builds" - echo "$$MATCHING_BUILDS" - exit 1 - fi -- id: hcls - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/hcls-v5-legacy.yml" diff --git a/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm-v5-legacy.yaml b/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm-v5-legacy.yaml deleted file mode 100644 index 008f2939e8..0000000000 --- a/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm-v5-legacy.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -tags: -- m.DDN-EXAScaler -- m.dashboard -- m.filestore -- m.pre-existing-vpc -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- m.service-account -- slurm5 - -timeout: 14400s # 4hr -steps: -- id: hpc-enterprise-slurm - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm-v5-legacy.yml" diff --git a/tools/cloud-build/daily-tests/builds/hpc-slurm-chromedesktop-v5-legacy.yaml b/tools/cloud-build/daily-tests/builds/hpc-slurm-chromedesktop-v5-legacy.yaml deleted file mode 100644 index 7e4a4acb56..0000000000 --- a/tools/cloud-build/daily-tests/builds/hpc-slurm-chromedesktop-v5-legacy.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.chrome-remote-desktop -- m.filestore -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- m.vpc -- crd -- slurm5 - -timeout: 14400s # 4hr -steps: -- id: hpc-slurm-chromedesktop - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-slurm-chromedesktop.yml" diff --git a/tools/cloud-build/daily-tests/builds/lustre-slurm-v5-legacy.yaml b/tools/cloud-build/daily-tests/builds/lustre-slurm-v5-legacy.yaml deleted file mode 100644 index 7088e64267..0000000000 --- a/tools/cloud-build/daily-tests/builds/lustre-slurm-v5-legacy.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.DDN-EXAScaler -- m.pre-existing-vpc -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- slurm5 - -timeout: 14400s # 4hr -steps: -- id: lustre-slurm - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/lustre-slurm-v5-legacy.yml" diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm-v5.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm-v5.yaml deleted file mode 100644 index d44c7baec1..0000000000 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm-v5.yaml +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -tags: -- m.custom-image -- m.pre-existing-vpc -- m.startup-script -- slurm5 - -timeout: 14400s # 4hr -steps: -- id: ml-a3-highgpu-slurm-image - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-image-legacy.yml" - IMAGE_NAME=$(gcloud compute images list --project "${PROJECT_ID}" \ - --no-standard-images --filter="labels.ghpc_deployment~$${BUILD_ID_SHORT}" \ - --format='get(name)' --limit=1) - - echo $${IMAGE_NAME} > /persistent_volume/image_name - volumes: - - name: 'persistent_volume' - path: '/persistent_volume' -- id: ml-a3-highgpu-slurm-cluster - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - NFS_DEPLOYMENT_NAME="a3hnfs$${BUILD_ID_SHORT}" - - destroy_on_exit() { - ./gcluster destroy "$${NFS_DEPLOYMENT_NAME}" --auto-approve - cat /persistent_volume/image_name | xargs -L1 gcloud compute images delete --project "${PROJECT_ID}" --quiet - } - - REGION=us-west1 - ZONE=us-west1-a - - trap 'destroy_on_exit' EXIT - - ./gcluster deploy \ - --vars region="$${REGION}" \ - --vars zone="$${ZONE}" \ - --vars project_id="${PROJECT_ID}" \ - --vars deployment_name="$${NFS_DEPLOYMENT_NAME}" \ - tools/cloud-build/daily-tests/blueprints/nfs-server-homefs.yaml \ - --auto-approve - - NFS_IP=$(gcloud compute instances list --project "${PROJECT_ID}" \ - --filter="labels.ghpc_module=nfs-server and labels.ghpc_deployment=$${NFS_DEPLOYMENT_NAME}" \ - --format='get(networkInterfaces[0].networkIP)') - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} nfs_ip=$${NFS_IP}" \ - --extra-vars="region=$${REGION} zone=$${ZONE} remote_mount_homefs=/exports/home" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster-legacy.yml" - volumes: - - name: 'persistent_volume' - path: '/persistent_volume' diff --git a/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml b/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml deleted file mode 100644 index 3382f342b6..0000000000 --- a/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.custom-image -- m.filestore -- m.firewall-rules -- m.pre-existing-vpc -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- m.startup-script -- slurm5 - -timeout: 18000s # 5hr -steps: -# test image creation by provisioning a new VPC and using Packer to build an -# image in it -- id: ml-slurm - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-slurm-v5-legacy.yml" diff --git a/tools/cloud-build/daily-tests/builds/packer-v5-legacy.yaml b/tools/cloud-build/daily-tests/builds/packer-v5-legacy.yaml deleted file mode 100644 index a2f2c32296..0000000000 --- a/tools/cloud-build/daily-tests/builds/packer-v5-legacy.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.custom-image -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- m.startup-script -- m.vpc -- packer - -timeout: 14400s # 4hr -steps: -# test image creation by provisioning a new VPC and using Packer to build an -# image in it -- id: packer - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/packer-v5-legacy.yml" diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-debian.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-debian.yaml deleted file mode 100644 index 15c0c35650..0000000000 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-debian.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.filestore -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- m.vpc -- slurm5 - -timeout: 14400s # 4hr -steps: -- id: slurm-gcp-v5-debian - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml" diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml deleted file mode 100644 index ed48e66298..0000000000 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.filestore -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- m.vpc -- slurm5 - -timeout: 14400s # 4hr -steps: -- id: slurm-gcp-v5-hpc-centos7 - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml" diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-rocky8.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-rocky8.yaml deleted file mode 100644 index 562f1f4277..0000000000 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-rocky8.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.filestore -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- m.vpc -- slurm5 - -timeout: 14400s # 4hr -steps: -- id: slurm-gcp-v5-rocky8 - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml" diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-startup-scripts-v5-legacy.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-startup-scripts-v5-legacy.yaml deleted file mode 100644 index d7221cb59a..0000000000 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-startup-scripts-v5-legacy.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -tags: -- m.cloud-storage-bucket -- m.nfs-server -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- m.startup-script -- m.vpc -- slurm5 - -timeout: 14400s # 4hr -steps: -- id: slurm-gcp-v5-startup-scripts - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v5-startup-scripts.yml" diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-ubuntu2004.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-ubuntu2004.yaml deleted file mode 100644 index 67db53434b..0000000000 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-ubuntu2004.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.filestore -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition -- m.vpc -- slurm5 - -timeout: 14400s # 4hr -steps: -- id: slurm-gcp-v5-ubuntu2004 - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v5-ubuntu.yml" diff --git a/tools/cloud-build/daily-tests/tests/hcls-v5-legacy.yml b/tools/cloud-build/daily-tests/tests/hcls-v5-legacy.yml deleted file mode 100644 index 073e773d2c..0000000000 --- a/tools/cloud-build/daily-tests/tests/hcls-v5-legacy.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -test_name: hcls-cluster -deployment_name: "hcls-{{ build }}" -# No non-alphanumerical characters in the slurm cluster name - they will be -# removed by Cluster Toolkit slurm wrappers, which will break the playbook -slurm_cluster_name: "hcls{{ build[0:6] }}" -zone: europe-west1-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/docs/videos/healthcare-and-life-sciences/hcls-blueprint-v5-legacy.yaml" -network: "{{ deployment_name }}-net" -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -cli_deployment_vars: - network_name: "{{ network }}" - region: europe-west1 - zone: "{{ zone }}" - disable_login_public_ips: "false" - disable_controller_public_ips: "false" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - mounts: - - /home - - /apps - - /data_input - - /data_output -wait_for_compute_nodes_to_go_down: true diff --git a/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm-v5-legacy.yml b/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm-v5-legacy.yml deleted file mode 100644 index 4457c03587..0000000000 --- a/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm-v5-legacy.yml +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -test_name: hpc-enterprise-slurm -deployment_name: "enter-{{ build }}" -# Manually adding the slurm_cluster_name for use in node names, which filters -# non-alphanumeric chars and is capped at 10 chars. -slurm_cluster_name: "enter{{ build[0:5] }}" -zone: europe-west1-d -cli_deployment_vars: - region: europe-west1 - zone: "{{ zone }}" - zones: "[europe-west1-b,europe-west1-c,europe-west1-d]" -workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/hpc-enterprise-slurm-v5-legacy.yaml" -network: "default" -# Note: Pattern matching in gcloud only supports 1 wildcard. -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - n2 - - c2 - - c2d - # Disable those partitions for now. - # Note the current selected region may not support some of these partitions - # consult with https://cloud.google.com/compute/docs/regions-zones/ - #- c3 - #- a208 - #- a216 - mounts: - - /home - - /projects - - /scratch -wait_for_compute_nodes_to_go_down: true diff --git a/tools/cloud-build/daily-tests/tests/hpc-slurm-chromedesktop.yml b/tools/cloud-build/daily-tests/tests/hpc-slurm-chromedesktop.yml deleted file mode 100644 index 700a3a1807..0000000000 --- a/tools/cloud-build/daily-tests/tests/hpc-slurm-chromedesktop.yml +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -test_name: slurm-crd -deployment_name: "slm-crd-{{ build }}" - -# Manually adding the slurm_cluster_name for use in node names, which filters -# non-alphanumeric chars and is capped at 10 chars. -slurm_cluster_name: "slmcrd{{ build[0:4] }}" -zone: europe-west1-c -cli_deployment_vars: - network_name: "{{ network }}" - region: europe-west1 - zone: "{{ zone }}" -workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml" -network: "{{ deployment_name }}-net" -# Note: Pattern matching in gcloud only supports 1 wildcard. -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-crd.yml -custom_vars: - mounts: - - /home - partitions: - - desktop - - compute -wait_for_compute_nodes_to_go_down: true diff --git a/tools/cloud-build/daily-tests/tests/lustre-slurm-v5-legacy.yml b/tools/cloud-build/daily-tests/tests/lustre-slurm-v5-legacy.yml deleted file mode 100644 index 6cd001d1c3..0000000000 --- a/tools/cloud-build/daily-tests/tests/lustre-slurm-v5-legacy.yml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -test_name: test-slurm-lustre -deployment_name: "lustr-{{ build }}" -region: us-central1 -zone: us-central1-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/lustre-slurm-v5-legacy.yaml" -network: "default" -slurm_cluster_name: "lustr{{ build[0:5] }}" -cli_deployment_vars: - region: "{{ region }}" - zone: "{{ zone }}" -# Note: Pattern matching in gcloud only supports 1 wildcard. -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -- test-validation/test-lustre-slurm.yml -custom_vars: - output_dir: /lustre/test - num_slurm_nodes: 1 - mounts: - - /lustre - partitions: - - centos - - rocky - # - ubuntu -wait_for_compute_nodes_to_go_down: true diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster-legacy.yml b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster-legacy.yml deleted file mode 100644 index 1172471ce0..0000000000 --- a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster-legacy.yml +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -# region, zone, nfs_ip, remote_mount_homefs, must be defined in build file -# with --extra-vars flag! -test_name: a3h-cluster -deployment_name: a3hc-{{ build }} -slurm_cluster_name: "a3hc{{ build[0:4] }}" -workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-2-cluster-v5-legacy.yaml" -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -network: default -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -# v5 solutions do not have post 3.5.0 fix for enroot with service accounts -# if this changes, reinsert this test -# - test-validation/test-enroot.yml -custom_vars: - partitions: - - a3 - - debug - mounts: - - /home -cli_deployment_vars: - network_name_system: default - subnetwork_name_system: default - region: "{{ region }}" - zone: "{{ zone }}" - server_ip_homefs: "{{ nfs_ip }}" - remote_mount_homefs: "{{ remote_mount_homefs }}" - slurm_cluster_name: "{{ slurm_cluster_name }}" - a3_static_cluster_size: 2 - disable_login_public_ips: false - disable_controller_public_ips: false diff --git a/tools/cloud-build/daily-tests/tests/ml-slurm-v5-legacy.yml b/tools/cloud-build/daily-tests/tests/ml-slurm-v5-legacy.yml deleted file mode 100644 index 5fbb9315e8..0000000000 --- a/tools/cloud-build/daily-tests/tests/ml-slurm-v5-legacy.yml +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: ml-slurm -deployment_name: ml-slurm-{{ build }} -workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/ml-slurm-v5-legacy.yaml" -packer_group_name: packer -packer_module_id: custom-image -wait_for_compute_nodes_to_go_down: true diff --git a/tools/cloud-build/daily-tests/tests/packer-v5-legacy.yml b/tools/cloud-build/daily-tests/tests/packer-v5-legacy.yml deleted file mode 100644 index 07158a5a0f..0000000000 --- a/tools/cloud-build/daily-tests/tests/packer-v5-legacy.yml +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: image-builder -deployment_name: pkr{{ build }} -zone: us-central1-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/image-builder-v5-legacy.yaml" -network: "{{ deployment_name }}-net" -packer_group_name: packer -packer_module_id: custom-image -cli_deployment_vars: - network_name: "{{ network }}" - subnetwork_name: "{{ network }}-sub" diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml deleted file mode 100644 index 2a06c30571..0000000000 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: hpc-slurm-debian -deployment_name: "debi-v5-{{ build }}" -# Manually adding the slurm_cluster_name for use in node names, which filters -# non-alphanumeric chars and is capped at 10 chars. -slurm_cluster_name: "debiv5{{ build[0:4] }}" - -cli_deployment_vars: - network_name: "{{ network }}" - instance_image: "{family: slurm-gcp-5-12-debian-11, project: schedmd-slurm-public}" - region: us-west4 - zone: us-west4-c - -zone: us-west4-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml" -network: "{{ deployment_name }}-net" -# Note: Pattern matching in gcloud only supports 1 wildcard, centv5*-login-* won't work. -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - - debug - mounts: - - /home -wait_for_compute_nodes_to_go_down: true diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml deleted file mode 100644 index 52400b9e66..0000000000 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: hpc-slurm -deployment_name: "cent-v5-{{ build }}" -# Manually adding the slurm_cluster_name for use in node names, which filters -# non-alphanumeric chars and is capped at 10 chars. 
-slurm_cluster_name: "centv5{{ build[0:4] }}" -zone: us-west4-c -cli_deployment_vars: - network_name: "{{ network }}" - enable_cleanup_compute: true - region: us-west4 - zone: "{{ zone }}" - zones: "[us-west4-a,us-west4-b,us-west4-c]" -workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/hpc-slurm-v5-legacy.yaml" -network: "{{ deployment_name }}-net" -# Note: Pattern matching in gcloud only supports 1 wildcard, centv5*-login-* won't work. -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - - debug - mounts: - - /home -wait_for_compute_nodes_to_go_down: true diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml deleted file mode 100644 index cb76a571b4..0000000000 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: hpc-slurm-rocky8 -deployment_name: "rock-8-{{ build }}" -# Manually adding the slurm_cluster_name for use in node names, which filters -# non-alphanumeric chars and is capped at 10 chars. -slurm_cluster_name: "rock8{{ build[0:5] }}" - -cli_deployment_vars: - network_name: "{{ network }}" - instance_image: "{family: slurm-gcp-5-12-hpc-rocky-linux-8, project: schedmd-slurm-public}" - region: us-west4 - zone: us-west4-c - -zone: us-west4-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml" -network: "{{ deployment_name }}-net" -# Note: Pattern matching in gcloud only supports 1 wildcard, centv5*-login-* won't work. -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - - debug - mounts: - - /home -wait_for_compute_nodes_to_go_down: true diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-startup-scripts.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-startup-scripts.yml deleted file mode 100644 index 9037211bb2..0000000000 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-startup-scripts.yml +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -test_name: hpc-cluster-slurm-v5 -deployment_name: "ss-v5-{{ build }}" -# Manually adding the slurm_cluster_name for use in node names, which filters -# non-alphanumeric chars and is capped at 10 chars. -slurm_cluster_name: "ssv5{{ build[0:6] }}" -zone: us-west4-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/validate_configs/test_configs/slurm-gcp-v5-startup-scripts-v5-legacy.yaml" -network: "{{ deployment_name }}-net" -# Note: Pattern matching in gcloud only supports 1 wildcard, centv5*-login-* won't work. -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - - debug - mounts: - - /home - - /data -wait_for_compute_nodes_to_go_down: true diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-ubuntu.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-ubuntu.yml deleted file mode 100644 index e104f5ede2..0000000000 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-ubuntu.yml +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: hpc-slurm-ubuntu2004 -deployment_name: "ubun-v5-{{ build }}" -# Manually adding the slurm_cluster_name for use in node names, which filters -# non-alphanumeric chars and is capped at 10 chars. -slurm_cluster_name: "ubunv5{{ build[0:4] }}" -zone: us-west4-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml" -network: "{{ deployment_name }}-net" -# Note: Pattern matching in gcloud only supports 1 wildcard, centv5*-login-* won't work. -login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - - debug - mounts: - - /home -cli_deployment_vars: - network_name: "{{ network }}" -wait_for_compute_nodes_to_go_down: true From d56d8a276b35f405a94dcd8f60c78624cb439471 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 Jan 2025 18:31:31 +0000 Subject: [PATCH 084/140] Bump github.com/go-git/go-billy/v5 from 5.6.0 to 5.6.1 Bumps [github.com/go-git/go-billy/v5](https://github.com/go-git/go-billy) from 5.6.0 to 5.6.1. - [Release notes](https://github.com/go-git/go-billy/releases) - [Commits](https://github.com/go-git/go-billy/compare/v5.6.0...v5.6.1) --- updated-dependencies: - dependency-name: github.com/go-git/go-billy/v5 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- go.mod | 8 ++++---- go.sum | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/go.mod b/go.mod index 8e9d4e4d7c..6e9502407e 100644 --- a/go.mod +++ b/go.mod @@ -22,7 +22,7 @@ require ( require ( github.com/fatih/color v1.18.0 - github.com/go-git/go-billy/v5 v5.6.0 + github.com/go-git/go-billy/v5 v5.6.1 github.com/google/go-cmp v0.6.0 github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 @@ -35,7 +35,7 @@ require ( cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect dario.cat/mergo v1.0.0 // indirect github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect - github.com/cyphar/filepath-securejoin v0.2.5 // indirect + github.com/cyphar/filepath-securejoin v0.3.6 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-logr/logr v1.4.1 // indirect @@ -44,7 +44,7 @@ require ( github.com/hashicorp/terraform-json v0.22.1 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/rogpeppe/go-internal v1.11.0 // indirect + github.com/rogpeppe/go-internal v1.12.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect go.opentelemetry.io/otel v1.24.0 // indirect @@ -96,7 +96,7 @@ require ( github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect golang.org/x/crypto v0.31.0 // indirect - golang.org/x/net v0.27.0 // indirect + golang.org/x/net v0.33.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect golang.org/x/sys v0.28.0 golang.org/x/text v0.21.0 // indirect diff --git a/go.sum b/go.sum index bdf95899c5..f976fd23a8 100644 --- a/go.sum +++ b/go.sum @@ -231,8 +231,8 @@ github.com/cncf/xds/go v0.0.0-20211001041855-01bcc9b48dfe/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/cyphar/filepath-securejoin v0.2.5 h1:6iR5tXJ/e6tJZzzdMc1km3Sa7RRIVBKAK32O2s7AYfo= -github.com/cyphar/filepath-securejoin v0.2.5/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= +github.com/cyphar/filepath-securejoin v0.3.6 h1:4d9N5ykBnSp5Xn2JkhocYDkOpURL/18CYMpo6xB9uWM= +github.com/cyphar/filepath-securejoin v0.3.6/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -261,8 +261,8 @@ github.com/gliderlabs/ssh v0.3.7 h1:iV3Bqi942d9huXnzEF2Mt+CY9gLu8DNM4Obd+8bODRE= github.com/gliderlabs/ssh v0.3.7/go.mod h1:zpHEXBstFnQYtGnB8k8kQLol82umzn/2/snG7alWVD8= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 h1:+zs/tPmkDkHx3U66DAb0lQFJrpS6731Oaa12ikc+DiI= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376/go.mod h1:an3vInlBmSxCcxctByoQdvwPiA7DTK7jaaFDBTtu0ic= -github.com/go-git/go-billy/v5 v5.6.0 
h1:w2hPNtoehvJIxR00Vb4xX94qHQi/ApZfX+nBE2Cjio8= -github.com/go-git/go-billy/v5 v5.6.0/go.mod h1:sFDq7xD3fn3E0GOwUSZqHo9lrkmx8xJhA0ZrfvjBRGM= +github.com/go-git/go-billy/v5 v5.6.1 h1:u+dcrgaguSSkbjzHwelEjc0Yj300NUevrrPphk/SoRA= +github.com/go-git/go-billy/v5 v5.6.1/go.mod h1:0AsLr1z2+Uksi4NlElmMblP5rPcDZNRCD8ujZCRR2BE= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod h1:1OCfN199q1Jm3HZlxleg+Dw/mwps2Wbk9frAWm+4FII= github.com/go-git/go-git/v5 v5.12.0 h1:7Md+ndsjrzZxbddRDZjF14qK+NN56sy6wkqaVrjZtys= @@ -458,8 +458,8 @@ github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1: github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= -github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= -github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= @@ -484,8 +484,8 @@ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/ulikunitz/xz v0.5.10 h1:t92gobL9l3HE202wg3rlk19F6X+JOxl9BBrCCMYEYd8= github.com/ulikunitz/xz v0.5.10/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= @@ -619,8 +619,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.0.0-20220909164309-bea034e7d591/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= -golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= -golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= +golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod 
h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= From 0cd02fce9375d4895c86de9fbbb491d9b87522d6 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 6 Jan 2025 15:43:19 +0000 Subject: [PATCH 085/140] Remove slurm-gcp v5 examples and update documentation --- community/examples/AMD/README.md | 4 - .../examples/AMD/hpc-amd-slurm-v5-legacy.yaml | 231 --------- .../hpc-slurm-chromedesktop-v5-legacy.yaml | 119 ----- .../hpc-slurm-local-ssd-v5-legacy.yaml | 109 ----- .../hpc-slurm-ubuntu2004-v5-legacy.yaml | 96 ---- community/examples/htc-slurm-v5-legacy.yaml | 165 ------- .../healthcare-and-life-sciences/README.md | 4 - .../hcls-blueprint-v5-legacy.yaml | 353 -------------- examples/README.md | 455 ------------------ examples/cae/README.md | 4 - examples/cae/cae-slurm-v5-legacy.yaml | 254 ---------- examples/hpc-enterprise-slurm-v5-legacy.yaml | 326 ------------- examples/hpc-slurm-v5-legacy.yaml | 112 ----- examples/image-builder-v5-legacy.yaml | 110 ----- .../a3-highgpu-8g/v5-legacy/README.md | 342 ------------- .../ml-slurm-a3-0-base-v5-legacy.yaml | 61 --- .../ml-slurm-a3-1-image-v5-legacy.yaml | 283 ----------- .../ml-slurm-a3-2-cluster-v5-legacy.yaml | 213 -------- examples/ml-slurm-v5-legacy.yaml | 266 ---------- .../blueprints/lustre-slurm-v5-legacy.yaml | 151 ------ .../test_configs/gpu-v5-legacy.yaml | 189 -------- .../test_configs/node-groups-v5-legacy.yaml | 173 ------- ...lurm-gcp-v5-startup-scripts-v5-legacy.yaml | 123 ----- .../slurm-static-test-v5-legacy.yaml | 100 ---- .../zone-policies-slurm-v5-legacy.yaml | 94 ---- 25 files changed, 4337 deletions(-) delete mode 100644 community/examples/AMD/hpc-amd-slurm-v5-legacy.yaml delete mode 100644 community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml delete mode 100644 community/examples/hpc-slurm-local-ssd-v5-legacy.yaml delete mode 100644 community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml delete mode 100644 community/examples/htc-slurm-v5-legacy.yaml delete mode 100644 docs/videos/healthcare-and-life-sciences/hcls-blueprint-v5-legacy.yaml delete mode 100644 examples/cae/cae-slurm-v5-legacy.yaml delete mode 100644 examples/hpc-enterprise-slurm-v5-legacy.yaml delete mode 100644 examples/hpc-slurm-v5-legacy.yaml delete mode 100644 examples/image-builder-v5-legacy.yaml delete mode 100644 examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md delete mode 100644 examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-0-base-v5-legacy.yaml delete mode 100644 examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml delete mode 100644 examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-2-cluster-v5-legacy.yaml delete mode 100644 examples/ml-slurm-v5-legacy.yaml delete mode 100644 tools/cloud-build/daily-tests/blueprints/lustre-slurm-v5-legacy.yaml delete mode 100644 tools/validate_configs/test_configs/gpu-v5-legacy.yaml delete mode 100644 tools/validate_configs/test_configs/node-groups-v5-legacy.yaml delete mode 100644 tools/validate_configs/test_configs/slurm-gcp-v5-startup-scripts-v5-legacy.yaml delete mode 100644 tools/validate_configs/test_configs/slurm-static-test-v5-legacy.yaml delete mode 100644 tools/validate_configs/test_configs/zone-policies-slurm-v5-legacy.yaml diff --git a/community/examples/AMD/README.md b/community/examples/AMD/README.md index 
ffc25e2598..38ccda4442 100644 --- a/community/examples/AMD/README.md +++ b/community/examples/AMD/README.md @@ -1,9 +1,5 @@ # AMD solutions for the Cluster Toolkit (formerly HPC Toolkit) -> [!NOTE] -> This document uses Slurm-GCP v6. If you want to use Slurm-GCP v5 version you -> scan refer [blueprint](./hpc-amd-slurm-v5-legacy.yaml) - ## AMD-Optimized Slurm Cluster This example provisions a Slurm cluster using the AMD-based Computed Optimized diff --git a/community/examples/AMD/hpc-amd-slurm-v5-legacy.yaml b/community/examples/AMD/hpc-amd-slurm-v5-legacy.yaml deleted file mode 100644 index c92044511f..0000000000 --- a/community/examples/AMD/hpc-amd-slurm-v5-legacy.yaml +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -blueprint_name: hpc-amd-slurm - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: amd-v5 - region: us-east4 - zone: us-east4-c - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: swfs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /sw - - - id: spack-setup - source: community/modules/scripts/spack-setup - settings: - install_dir: /sw/spack - spack_ref: v0.18.1 - - - id: spack-execute - source: community/modules/scripts/spack-execute - use: [spack-setup] - settings: - log_file: /var/log/spack.log - data_files: - - destination: /tmp/projections-config.yaml - content: | - modules: - default: - tcl: - hash_length: 0 - all: - conflict: - - '{name}' - projections: - all: '{name}/{version}-{compiler.name}-{compiler.version}' - - destination: /tmp/slurm-external-config.yaml - content: | - packages: - slurm: - externals: - - spec: slurm@22-05-8 - prefix: /usr/local - buildable: False - - destination: /sw/spack/openfoam_env.yaml - content: | - spack: - definitions: - - compilers: - - gcc@10.3.0 - - mpis: - - openmpi@4.1.3+legacylaunchers+pmi fabrics=none schedulers=slurm - - packages: - - flex@2.6.4 - - mpi_packages: - - openfoam-org@8 ^flex@2.6.4 target=zen3 - specs: - - matrix: - - - $mpis - - - $%compilers - - matrix: - - - $packages - - - $%compilers - - matrix: - - - $mpi_packages - - - $%compilers - - - $^mpis - concretizer: - unify: when_possible - commands: | - spack config --scope defaults add config:build_stage:/opt/spack_build_stage - spack config --scope defaults add -f /tmp/projections-config.yaml - spack config --scope site add -f /tmp/slurm-external-config.yaml - spack config --scope site add concretizer:targets:host_compatible:false - - # gcc 12.1.0 is known to have runtime failures with OpenFOAM 8 - # gcc 10.3.0 is the earliest copy of gcc with Zen 3 support - spack install gcc@10.3.0 %gcc@4.8.5 target=x86_64 - spack load gcc@10.3.0 %gcc@4.8.5 target=x86_64 - spack compiler find --scope site - - if ! 
spack env list | grep -q openfoam; then - spack env create openfoam /sw/spack/openfoam_env.yaml - spack env activate openfoam - spack concretize - spack install - fi - - - id: spack-startup - source: modules/scripts/startup-script - settings: - runners: - - $(spack-execute.spack_runner) - - type: shell - destination: shutdown.sh - content: | - #!/bin/bash - if [ ! -f /etc/block_auto_shutdown ]; then - touch /etc/block_auto_shutdown - shutdown -h +1 - fi - - - id: slurm_startup - source: modules/scripts/startup-script - settings: - runners: - - $(spack-setup.spack_runner) - # the following installation of AOCC may be automated in the future - # with a clear direction to the user to read the EULA at - # https://developer.amd.com/aocc-compiler-eula/ - - type: data - destination: /var/tmp/install_aocc.sh - content: | - #!/bin/bash - source /sw/spack/share/spack/setup-env.sh - spack install aocc@3.2.0 +license-agreed - spack load aocc@3.2.0 - spack compiler find --scope site - spack -d install -v openmpi@4.1.3 %aocc@3.2.0 +legacylaunchers +pmi schedulers=slurm - - type: data - destination: /var/tmp/openfoam_test.sh - content: | - #!/bin/bash - # the following line works around a problem activating environments - # before directory is accessed - ls -lha /sw/spack/var/spack/environments/openfoam/ &>/dev/null - spack env activate openfoam - DIR=$HOME/openfoam_test - mkdir -p $DIR - cd $DIR - cp -fr $WM_PROJECT_DIR/tutorials/incompressible/simpleFoam/motorBike . - cd motorBike - ./Allrun - - - id: spack_builder - source: modules/compute/vm-instance - use: [network1, swfs, spack-startup] - settings: - name_prefix: spack-builder - machine_type: c2d-standard-16 - disable_public_ips: true - instance_image: - # these images must match the images used by Slurm modules below because - # we are building OpenMPI with PMI support in libraries contained in - # Slurm installation - family: slurm-gcp-5-12-hpc-centos-7 - project: schedmd-slurm-public - - - id: low_cost_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - machine_type: c2d-standard-4 - node_count_dynamic_max: 10 - bandwidth_tier: gvnic_enabled - - - id: low_cost_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - low_cost_node_group - settings: - partition_name: lowcost - enable_placement: false - - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - machine_type: c2d-standard-112 - node_count_dynamic_max: 50 - bandwidth_tier: gvnic_enabled - - # because is_default is set to true, jobs will run on this partition unless an - # alternative partition is specified using, for example, "srun -p lowcost" - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - compute_node_group - settings: - partition_name: compute - enable_placement: true - is_default: true - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - homefs - - swfs - - low_cost_partition - - compute_partition - settings: - machine_type: c2d-standard-4 - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - - slurm_startup - settings: - # need at least 8 physical cores to run OpenFOAM test - machine_type: c2d-standard-16 diff --git a/community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml b/community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml 
deleted file mode 100644 index 1a223d55cc..0000000000 --- a/community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: slurm-crd - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-crd-01 - region: us-central1 - zone: us-central1-c - instance_image_crd: - family: slurm-gcp-5-12-debian-11 - project: schedmd-slurm-public - instance_image: - family: slurm-gcp-5-12-hpc-centos-7 - project: schedmd-slurm-public - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: remote-desktop - source: community/modules/remote-desktop/chrome-remote-desktop - use: [network1] - settings: - install_nvidia_driver: true - # instance_count: 0 will create installation scripts only - # which can be used with slurm node provisioning - instance_count: 0 - - - id: crd_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - machine_type: n1-standard-8 - node_count_dynamic_max: 3 - disable_public_ips: false - instance_image: $(vars.instance_image_crd) - instance_image_custom: true - guest_accelerator: - - type: nvidia-tesla-t4-vws - count: 1 - - - id: crd_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - remote-desktop - - crd_node_group - settings: - partition_name: desktop - enable_placement: false - partition_startup_scripts_timeout: 900 - - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - machine_type: n2d-standard-16 - node_count_dynamic_max: 20 - bandwidth_tier: gvnic_enabled - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - compute_node_group - settings: - partition_name: compute - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - homefs - - crd_partition - - compute_partition - settings: - disable_controller_public_ips: false - compute_startup_scripts_timeout: 900 - cloud_parameters: - resume_rate: 0 - resume_timeout: 900 - suspend_rate: 0 - suspend_timeout: 300 - no_comma_params: false - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2d-standard-4 - disable_login_public_ips: false diff --git a/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml b/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml deleted file mode 100644 index 08e39819b7..0000000000 --- a/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml +++ /dev/null @@ -1,109 +0,0 
@@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-slurm-local-ssd - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-localssd - region: us-central1 - zone: us-central1-a - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: startup - source: modules/scripts/startup-script - settings: - # When shutting down a VM with local SSD disks, we strongly recommend the - # automatic migration of data following these instructions: - # https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance - # Failure to do will result in VMs that lose data and do not automatically - # mount local SSD filesystems - local_ssd_filesystem: - fs_type: ext4 - mountpoint: /mnt/localssd - permissions: "1777" # must quote numeric filesystem permissions! - - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - additional_disks: - - device_name: test-disk-1 - disk_name: null - disk_size_gb: 375 - disk_type: local-ssd - disk_labels: {} - auto_delete: true - boot: false - - device_name: test-disk-2 - disk_name: null - disk_size_gb: 375 - disk_type: local-ssd - disk_labels: {} - auto_delete: true - boot: false - bandwidth_tier: gvnic_enabled - machine_type: c2-standard-4 - node_count_dynamic_max: 5 - node_count_static: 0 - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - compute_node_group - - startup - settings: - is_default: true - partition_name: ssdcomp - region: us-central1 - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - homefs - - compute_partition - settings: - cloud_parameters: - resume_rate: 0 - resume_timeout: 300 - suspend_rate: 0 - suspend_timeout: 300 - no_comma_params: false - machine_type: n1-standard-4 - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n1-standard-4 diff --git a/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml b/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml deleted file mode 100644 index 916fcde74b..0000000000 --- a/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-slurm-ubuntu2004 - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-gcp-v5 - region: us-west4 - zone: us-west4-c - instance_image: - # Please refer to the following link for the latest images: - # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - family: slurm-gcp-5-12-ubuntu-2004-lts - project: schedmd-slurm-public - instance_image_custom: true - - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. To refer to a local module, prefix with ./, ../ or / - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: debug_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - - - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - debug_node_group - settings: - partition_name: debug - exclusive: false # allows nodes to stay up after jobs are done - enable_placement: false # the default is: true - is_default: true - - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - bandwidth_tier: gvnic_enabled - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - compute_node_group - settings: - partition_name: compute - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - debug_partition - - compute_partition - - homefs - settings: - disable_controller_public_ips: false - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-4 - disable_login_public_ips: false diff --git a/community/examples/htc-slurm-v5-legacy.yaml b/community/examples/htc-slurm-v5-legacy.yaml deleted file mode 100644 index 1089cf9904..0000000000 --- a/community/examples/htc-slurm-v5-legacy.yaml +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright 2022 Google LLC -# Copyright (C) SchedMD LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -# This blueprint provisions a cluster using the Slurm scheduler configured to -# efficiently run many short duration, loosely-coupled (non-MPI) jobs. 
See also: -# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md -# https://slurm.schedmd.com/high_throughput.html - -blueprint_name: htc-slurm - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: htc-slurm - region: us-west4 - zone: us-west4-c - # By default, public IPs are set in the login and controller to allow easier - # SSH access. To turn this behavior off, set this to true. - disable_public_ips: false - # Stage `community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/*` into the deployment folder. - # If you move the blueprint, make sure the relative path is correct. - staged_configs: $(ghpc_stage("../modules/scheduler/schedmd-slurm-gcp-v5-controller/etc")) - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: projectsfs - source: modules/file-system/filestore - use: [network1] - settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 - local_mount: /projects - - # This file system has an associated license cost. - # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud - - id: scratchfs - source: community/modules/file-system/DDN-EXAScaler - use: [network1] - settings: - local_mount: /scratch - - # The compute partition is designed for performance. - # Use: - # `srun -N 4 -p compute <>` for any node in the partition. - # `srun -N 4 -p compute --mincpus 30 <>` for node group c2s60. - - - id: compute_node_group_c2s60 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: c2s60 - node_count_dynamic_max: 200 - bandwidth_tier: gvnic_enabled - - - id: compute_node_group_c2s30 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: c2s30 - node_count_dynamic_max: 200 - machine_type: c2-standard-30 - bandwidth_tier: gvnic_enabled - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - scratchfs - - projectsfs - - compute_node_group_c2s60 - - compute_node_group_c2s30 - settings: - partition_name: compute - enable_placement: false - exclusive: false - - # The lowcost partition is designed to run at a lower cost and without additional quota - # Use: - # `srun -N 4 <>` for any node in the partition. - # `srun -N 4 --mincpus 2` for node group n2s4. 
- - id: low_cost_node_group_n2s2 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: n2s2 - machine_type: n2-standard-2 - node_count_dynamic_max: 10 - bandwidth_tier: gvnic_enabled - - - id: low_cost_node_group_n2s4 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: n2s4 - machine_type: n2-standard-4 - node_count_dynamic_max: 10 - bandwidth_tier: gvnic_enabled - - - id: low_cost_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - scratchfs - - projectsfs - - low_cost_node_group_n2s2 - - low_cost_node_group_n2s4 - settings: - is_default: true - partition_name: lowcost - enable_placement: false - exclusive: false - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - homefs - - scratchfs - - projectsfs - - low_cost_partition - - compute_partition - settings: - machine_type: c2-standard-8 - disable_controller_public_ips: $(vars.disable_public_ips) - slurm_conf_tpl: $(vars.staged_configs)/htc-slurm.conf.tpl - slurmdbd_conf_tpl: $(vars.staged_configs)/htc-slurmdbd.conf.tpl - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-4 - disable_login_public_ips: $(vars.disable_public_ips) - - - id: hpc_dashboard - source: modules/monitoring/dashboard - outputs: [instructions] diff --git a/docs/videos/healthcare-and-life-sciences/README.md b/docs/videos/healthcare-and-life-sciences/README.md index de76730b9c..3bd9d7949f 100644 --- a/docs/videos/healthcare-and-life-sciences/README.md +++ b/docs/videos/healthcare-and-life-sciences/README.md @@ -1,9 +1,5 @@ # Healthcare and Life Science Blueprint -> [!NOTE] -> This document uses SlurmGCP v6 version of hcls blueprint. If you want to -> use SlurmGCP v5 version, please refer to this [blueprint](./hcls-blueprint-v5-legacy.yaml). - This folder captures an advanced architecture that can be used to run GROMACS with GPUs or CPUs on Google Cloud. diff --git a/docs/videos/healthcare-and-life-sciences/hcls-blueprint-v5-legacy.yaml b/docs/videos/healthcare-and-life-sciences/hcls-blueprint-v5-legacy.yaml deleted file mode 100644 index 7df8f8ee5a..0000000000 --- a/docs/videos/healthcare-and-life-sciences/hcls-blueprint-v5-legacy.yaml +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hcls-cluster-v5 - -validators: -- validator: test_apis_enabled - skip: true # skipping this validator, since "service-enablement" will take care of it. 
- -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hcls-01 - region: us-central1 - zone: us-central1-c - bucket_force_destroy: false - -deployment_groups: -- group: enable_apis - modules: - - ### Enable APIs ### - - - id: services-api - source: community/modules/project/service-enablement - settings: - gcp_service_list: - - file.googleapis.com - - iam.googleapis.com - - pubsub.googleapis.com - - secretmanager.googleapis.com - - serviceusage.googleapis.com - - compute.googleapis.com - - stackdriver.googleapis.com - -- group: setup - modules: - - ### Network ### - - - id: network1 - source: modules/network/vpc - - ### Resource Monitoring ### - - - id: hpc-dash - source: modules/monitoring/dashboard - - ### Storage ### - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - filestore_share_name: homeshare - local_mount: /home - - - id: appsfs - source: modules/file-system/filestore - use: [network1] - settings: - filestore_share_name: appsshare - local_mount: /apps - - - id: bucket-software - source: community/modules/file-system/cloud-storage-bucket - settings: - name_prefix: hcls-user-provided-software - random_suffix: true - local_mount: /user_provided_software - force_destroy: $(vars.bucket_force_destroy) - outputs: [gcs_bucket_path] - - - id: bucket-input - source: community/modules/file-system/cloud-storage-bucket - settings: - name_prefix: hcls-inputs - random_suffix: true - local_mount: /data_input - mount_options: defaults,_netdev,implicit_dirs,allow_other,dir_mode=0777,file_mode=766 - force_destroy: $(vars.bucket_force_destroy) - - - id: bucket-output - source: community/modules/file-system/cloud-storage-bucket - settings: - name_prefix: hcls-outputs - random_suffix: true - local_mount: /data_output - mount_options: defaults,_netdev,implicit_dirs,allow_other,dir_mode=0777,file_mode=766 - force_destroy: $(vars.bucket_force_destroy) - -- group: software_installation - modules: - - ### Software ### - - - id: spack-setup - source: community/modules/scripts/spack-setup - settings: - install_dir: /apps/spack - - - id: spack-execute - source: community/modules/scripts/spack-execute - use: [spack-setup] - settings: - data_files: - - destination: /tmp/projections-config.yaml - content: | - modules: - default: - tcl: - hash_length: 0 - all: - conflict: - - '{name}' - projections: - all: '{name}/{version}-{compiler.name}-{compiler.version}' - - destination: /tmp/slurm-external-config.yaml - content: | - packages: - slurm: - externals: - - spec: slurm@21-08-8-2 - prefix: /usr/local - buildable: False - - destination: /share/spack/gromacs_env.yaml - content: | - spack: - definitions: - - compilers: - - gcc@11.3.0 - - cudas: - - cuda@11.8.0 - - cuda_mpis: - - openmpi@4.1.4+cuda - - mpi_cuda_packages: - - gromacs@2022.3+cuda+mpi - specs: - - $compilers - - matrix: - - [$cudas] - - [$%compilers] - - matrix: - - [$cuda_mpis] - - [$%compilers] - - [$^cudas] - - [target=skylake] - - matrix: - - [$mpi_cuda_packages] - - [$^cudas] - - [$^cuda_mpis] - - [$%compilers] - - [target=skylake] - commands: | - spack config --scope defaults add config:build_stage:/apps/spack/spack-stage - spack config --scope defaults add -f /tmp/projections-config.yaml - spack config --scope site add -f /tmp/slurm-external-config.yaml - - NVCC_PREPEND_FLAGS='-arch=all' - spack install gcc@11.3.0 target=x86_64 - spack load gcc@11.3.0 target=x86_64 - spack compiler find --scope site - - if ! 
spack env list | grep -q gromacs; then - spack env create gromacs /share/spack/gromacs_env.yaml - spack env activate gromacs - spack concretize - spack install - fi - - - id: spack-builder-startup - source: modules/scripts/startup-script - settings: - runners: - - $(spack-execute.spack_runner) - - - type: shell - destination: data_staging.sh - content: | - #!/bin/bash - wget --no-verbose -P /data_input/protein_data_bank/ https://files.rcsb.org/download/1AKI.pdb - wget --no-verbose -P /tmp/ https://ftp.gromacs.org/pub/benchmarks/water_GMX50_bare.tar.gz && \ - mkdir -p /data_input/gromacs_inputs/ && \ - tar xzf /tmp/water_GMX50_bare.tar.gz -C /data_input/gromacs_inputs/ && \ - rm /tmp/water_GMX50_bare.tar.gz - - # Set permissions for Spack environment - chmod -R a+rwX /apps/spack/var/spack/environments/gromacs - - - type: data - destination: /apps/gromacs/submit_gromacs_water_cpu.sh - content: | - #!/bin/bash - #SBATCH -N 1 - #SBATCH --ntasks-per-node 30 - #SBATCH -p compute - - # Size can be 0000.65 0000.96 0001.5 0003 0006 0012 0024 0048 0096 0192 0384 0768 1536 3072 - # Type can be 'pme' or 'rf' - - source /apps/spack/share/spack/setup-env.sh - spack env activate gromacs - - # Check that gmx_mpi exists - which gmx_mpi - cd $SLURM_SUBMIT_DIR - cp /data_input/gromacs_inputs/water-cut1.0_GMX50_bare/1536/* . - mpirun -n 1 gmx_mpi grompp -f pme.mdp -c conf.gro -p topol.top -o input.tpr - mpirun -n 30 gmx_mpi mdrun -notunepme -dlb yes -v -resethway -noconfout -nsteps 4000 -s input.tpr - - - type: data - destination: /apps/gromacs/submit_gromacs_water_gpu.sh - content: | - #!/bin/bash - #SBATCH -N 1 - #SBATCH --ntasks-per-node 1 - #SBATCH -p gpu - #SBATCH --gpus 1 - - # Size can be 0000.65 0000.96 0001.5 0003 0006 0012 0024 0048 0096 0192 0384 0768 1536 3072 - # Type can be 'pme' or 'rf' - - source /apps/spack/share/spack/setup-env.sh - spack env activate gromacs - - # Check that gmx_mpi exists - which gmx_mpi - cd $SLURM_SUBMIT_DIR - cp /data_input/gromacs_inputs/water-cut1.0_GMX50_bare/1536/* . - - # Significant GPU Optimizations only support constraints=h-bonds - # so we change this here for the water benchmark. - for a in *.mdp; do - sed -i 's/constraints[[:blank:]].*=.*all-bonds.*/constraints = h-bonds/' $a - done - mpirun -n 1 gmx_mpi grompp -f pme.mdp -c conf.gro -p topol.top -o input.tpr - - mpirun -n 1 -H localhost \ - env GMX_ENABLE_DIRECT_GPU_COMM=1 \ - gmx_mpi mdrun -v -nsteps 100000 -resetstep 90000 -noconfout \ - -pme gpu -update gpu -nb gpu -gputasks 00 -s input.tpr - - - type: shell - destination: shutdown.sh - content: | - #!/bin/bash - if [ ! -f /etc/block_auto_shutdown ]; then - touch /etc/block_auto_shutdown - shutdown -h +1 - fi - - - id: spack-builder - source: modules/compute/vm-instance - use: [network1, appsfs, bucket-input, spack-builder-startup] - settings: - name_prefix: spack-builder - add_deployment_name_before_prefix: true - threads_per_core: 2 - machine_type: c2-standard-16 - -- group: cluster - modules: - - ### Remote Desktop ### - - - id: desktop - source: community/modules/remote-desktop/chrome-remote-desktop - use: - - network1 - - homefs - - appsfs - - bucket-input - - bucket-output - - bucket-software - settings: - add_deployment_name_before_prefix: true - name_prefix: chrome-remote-desktop - install_nvidia_driver: true - startup_script: | - find /user_provided_software -name vmd-1.9.*.bin.LINUXAMD64*.tar.gz -exec tar xvzf '{}' -C . 
\; - cd vmd-1.9.*/ - ./configure - cd src/ - sudo make install - - ### Slurm Cluster ### - - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - machine_type: c2-standard-60 - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - compute_node_group - - homefs - - appsfs - - bucket-input - - bucket-output - settings: - partition_name: compute - - - id: gpu_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - enable_smt: true - node_count_dynamic_max: 20 - machine_type: a2-highgpu-1g - - - id: gpu_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - gpu_node_group - - homefs - - appsfs - - bucket-input - - bucket-output - settings: - partition_name: gpu - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - compute_partition - - gpu_partition - - homefs - - appsfs - - bucket-input - - bucket-output - settings: - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller diff --git a/examples/README.md b/examples/README.md index 92031bf130..9890286175 100644 --- a/examples/README.md +++ b/examples/README.md @@ -17,34 +17,26 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [(Optional) Setting up a remote terraform state](#optional-setting-up-a-remote-terraform-state) * [Completed Migration to Slurm-GCP v6](#completed-migration-to-slurm-gcp-v6) * [Blueprint Descriptions](#blueprint-descriptions) - * [hpc-slurm-v5-legacy.yaml](#hpc-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge] - * [hpc-enterprise-slurm-v5-legacy.yaml](#hpc-enterprise-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [hpc-enterprise-slurm.yaml](#hpc-enterprise-slurmyaml-) ![core-badge] * [hpc-slurm-static.yaml](#hpc-slurm-staticyaml-) ![core-badge] * [hpc-slurm6-tpu.yaml](#hpc-slurm6-tpuyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm6-tpu-maxtext.yaml](#hpc-slurm6-tpu-maxtextyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm6-apptainer.yaml](#hpc-slurm6-apptaineryaml--) ![community-badge] ![experimental-badge] - * [ml-slurm-v5-legacy.yaml](#ml-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [ml-slurm.yaml](#ml-slurmyaml-) ![core-badge] - * [image-builder-v5-legacy.yaml](#image-builder-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [image-builder.yaml](#image-builderyaml-) ![core-badge] * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge] * [serverless-batch-mpi.yaml](#serverless-batch-mpiyaml-) ![core-badge] * [pfs-lustre.yaml](#pfs-lustreyaml-) ![core-badge] * [ps-slurm.yaml](#ps-slurmyaml--) ![core-badge] ![experimental-badge] * [pfs-parallelstore.yaml](#pfs-parallelstoreyaml--) ![core-badge] ![experimental-badge] - * [cae-slurm-v5-legacy.yaml](#cae-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [cae-slurm.yaml](#cae-slurmyaml-) ![core-badge] * [hpc-build-slurm-image.yaml](#hpc-build-slurm-imageyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-ubuntu2004-v5-legacy.yaml](#hpc-slurm-ubuntu2004-v5-legacyyaml--) ![community-badge] ![deprecated-badge] * [hpc-slurm-ubuntu2004.yaml](#hpc-slurm-ubuntu2004yaml--) ![community-badge] - * [hpc-amd-slurm-v5-legacy.yaml](#hpc-amd-slurm-v5-legacyyaml--) ![community-badge] 
![deprecated-badge] * [hpc-amd-slurm.yaml](#hpc-amd-slurmyaml-) ![community-badge] * [hpc-slurm-sharedvpc.yaml](#hpc-slurm-sharedvpcyaml--) ![community-badge] ![experimental-badge] * [client-google-cloud-storage.yaml](#client-google-cloud-storageyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-gromacs.yaml](#hpc-slurm-gromacsyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-local-ssd-v5-legacy.yaml](#hpc-slurm-local-ssd-v5-legacyyaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [hpc-slurm-local-ssd.yaml](#hpc-slurm-local-ssdyaml--) ![community-badge] ![experimental-badge] * [hcls-blueprint.yaml](#hcls-blueprintyaml-) ![core-badge] * [hpc-gke.yaml](#hpc-gkeyaml--) ![core-badge] ![experimental-badge] @@ -52,14 +44,12 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [storage-gke](#storage-gkeyaml--) ![core-badge] ![experimental-badge] * [gke-a3-megagpu](#gke-a3-megagpuyaml--) ![core-badge] ![experimental-badge] * [gke-a3-highgpu](#gke-a3-highgpuyaml--) ![core-badge] ![experimental-badge] - * [htc-slurm-v5-legacy.yaml](#htc-slurm-v5-legacyyaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [htc-slurm.yaml](#htc-slurmyaml-) ![community-badge] * [htc-htcondor.yaml](#htc-htcondoryaml--) ![community-badge] ![experimental-badge] * [fsi-montecarlo-on-batch.yaml](#fsi-montecarlo-on-batchyaml-) ![community-badge] ![experimental-badge] * [tutorial-starccm-slurm.yaml](#tutorial-starccm-slurmyaml--) ![community-badge] ![experimental-badge] * [tutorial-starccm.yaml](#tutorial-starccmyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-ramble-gromacs.yaml](#hpc-slurm-ramble-gromacsyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-chromedesktop-v5-legacy.yaml](#hpc-slurm-chromedesktop-v5-legacyyaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [flux-cluster](#flux-clusteryaml--) ![community-badge] ![experimental-badge] * [tutorial-fluent.yaml](#tutorial-fluentyaml--) ![community-badge] ![experimental-badge] * [omnia-cluster.yaml](#omnia-clusteryaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] @@ -208,65 +198,6 @@ Toolkit team, partners, etc.) and are labeled with the community badge Blueprints that are still in development and less stable are also labeled with the experimental badge (![experimental-badge]). -### [hpc-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] - -> **Warning**: The variables `enable_reconfigure`, -> `enable_cleanup_compute`, and `enable_cleanup_subscriptions`, if set to -> `true`, require additional dependencies **to be installed on the system deploying the infrastructure**. -> -> ```shell -> # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt -> ``` - -Creates a basic auto-scaling Slurm cluster with mostly default settings. The -blueprint also creates a new VPC network, and a filestore instance mounted to -`/home`. - -There are 3 partitions in this example: `debug` `compute`, and `h3`. The `debug` -partition uses `n2-standard-2` VMs, which should work out of the box without -needing to request additional quota. The purpose of the `debug` partition is to -make sure that first time users are not immediately blocked by quota -limitations. - -[hpc-slurm-v5-legacy.yaml]: ./hpc-slurm-v5-legacy.yaml - -#### Compute Partition - -There is a `compute` partition that achieves higher performance. 
Any -performance analysis should be done on the `compute` partition. By default it -uses `c2-standard-60` VMs with placement groups enabled. You may need to request -additional quota for `C2 CPUs` in the region you are deploying in. You can -select the compute partition using the `-p compute` argument when running `srun`. - -#### H3 Partition - -There is an `h3` partition that uses compute-optimized `h3-standard-88` machine type. -You can read more about the H3 machine series [here](https://cloud.google.com/compute/docs/compute-optimized-machines#h3_series). - -#### Quota Requirements for hpc-slurm-v5-legacy.yaml - -For this example the following is needed in the selected region: - -* Cloud Filestore API: Basic HDD (Standard) capacity (GB): **1,024 GB** -* Compute Engine API: Persistent Disk SSD (GB): **~50 GB** -* Compute Engine API: Persistent Disk Standard (GB): **~50 GB static + 50 - GB/node** up to 1,250 GB -* Compute Engine API: N2 CPUs: **2** for the login node and **2/node** active - in the `debug` partition up to 12 -* Compute Engine API: C2 CPUs: **4** for the controller node and **60/node** - active in the `compute` partition up to 1,204 -* Compute Engine API: H3 CPUs: **88/node** active in the `h3` partition up to - 1760 - * The H3 CPU quota can be increased on the Cloud Console by navigating to - `IAM & Admin`->`Quotas` or searching `All Quotas` and entering `vm_family:H3` - into the filter bar. From there, the quotas for each region may be selected - and edited. -* Compute Engine API: Affinity Groups: **one for each job in parallel** - _only - needed for the `compute` partition_ -* Compute Engine API: Resource policies: **one for each job in parallel** - - _only needed for the `compute` partition_ - ### [hpc-slurm.yaml] ![core-badge] Creates a basic auto-scaling Slurm cluster with mostly default settings. The @@ -317,96 +248,6 @@ For this example the following is needed in the selected region: * Compute Engine API: Resource policies: **one for each job in parallel** - _only needed for the `compute` partition_ -### [hpc-enterprise-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] - -This advanced blueprint creates a cluster with Slurm with several performance -tunings enabled, along with tiered file systems for higher performance. Some of -these features come with additional cost and required additional quotas. - -The Slurm system deployed here connects to the default VPC of the project and -creates a login node and the following seven partitions: - -* `n2` with general-purpose [`n2-stardard-2` nodes][n2]. Placement policies and -exclusive usage are disabled, which means the nodes can be used for multiple jobs. -Nodes will remain idle for 5 minutes before Slurm deletes them. This partition can -be used for debugging and workloads that do not require high performance. -* `c2` with compute-optimized [`c2-standard-60` nodes][c2] based on Intel 3.9 GHz -Cascade Lake processors. -* `c2d` with compute optimized [`c2d-standard-112` nodes][c2d] base on the third -generation AMD EPYC Milan. -* `c3` with compute-optimized [`c3-highcpu-176` nodes][c3] based on Intel Sapphire -Rapids processors. When configured with Tier_1 networking, C3 nodes feature 200 Gbps -low-latency networking. -* `h3` with compute-optimized [`h3-standard-88` nodes][h3] based on Intel Sapphire -Rapids processors. H3 VMs can use the entire host network bandwidth and come with a default network bandwidth rate of up to 200 Gbps. 
-* `a208` with [`a2-ultragpu-8g` nodes][a2] with 8 of the NVIDIA A100 GPU accelerators -with 80GB of GPU memory each. -* `a216` with [`a2-megagpu-16g` nodes][a2] with 16 of the NVIDIA A100 GPU accelerators -with 40GB of GPU memory each. - -For all partitions other than `n2`, [compact placement] policies are enabled by default -and nodes are created and destroyed on a per-job basis. Furthermore, these partitions -are configured with: - -* Faster networking: Google Virtual NIC ([GVNIC]) is used for the GPU partitions and -[Tier_1] is selected when available. Selecting Tier_1 automatically enables GVNIC. -* SSD PDs disks for compute nodes. See the [Storage options] page for more details. - -[n2]: https://cloud.google.com/compute/docs/general-purpose-machines#n2_series -[c2]: https://cloud.google.com/compute/docs/compute-optimized-machines#c2_machine_types -[c2d]: https://cloud.google.com/compute/docs/compute-optimized-machines#c2d_machine_types -[c3]: https://cloud.google.com/blog/products/compute/introducing-c3-machines-with-googles-custom-intel-ipu -[h3]: https://cloud.google.com/compute/docs/compute-optimized-machines#h3_series -[a2]: https://cloud.google.com/compute/docs/gpus#a100-gpus -[g2]: https://cloud.google.com/compute/docs/gpus#l4-gpus -[compact placement]: https://cloud.google.com/compute/docs/instances/define-instance-placement -[GVNIC]: https://cloud.google.com/compute/docs/networking/using-gvnic -[Tier_1]: https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration -[Storage options]: https://cloud.google.com/compute/docs/disks - -File systems: - -* The homefs mounted at `/home` uses the "BASIC_SSD" tier filestore with - 2.5 TiB of capacity -* The projectsfs is mounted at `/projects` and is a high scale SSD filestore - instance with 10TiB of capacity. -* The scratchfs is mounted at `/scratch` and is a - [DDN Exascaler Lustre](../community/modules/file-system/DDN-EXAScaler/README.md) - file system designed for high IO performance. The capacity is ~10TiB. - -> **Warning**: The DDN Exascaler Lustre file system has a license cost as -> described in the pricing section of the -> [DDN EXAScaler Cloud Marketplace Solution](https://console.developers.google.com/marketplace/product/ddnstorage/). - -#### Quota Requirements for hpc-enterprise-slurm-v5-legacy.yaml - -For this example the following is needed in the selected region: - -* Cloud Filestore API: Basic SSD capacity (GB) per region: **2,560 GB** -* Cloud Filestore API: High Scale SSD capacity (GB) per region: **10,240 GiB** - - _min quota request is 61,440 GiB_ -* Compute Engine API: Persistent Disk SSD (GB): **~14,050 GB** static + - **100 GB/node** up to 23,250 GB -* Compute Engine API: Persistent Disk Standard (GB): **~396 GB** static + - **50 GB/node** up to 596 GB -* Compute Engine API: N2 CPUs: **116** for login and lustre and **2/node** active - in `n2` partition up to 124. 
-* Compute Engine API: C2 CPUs: **4** for controller node and **60/node** active - in `c2` partition up to 1,204 -* Compute Engine API: C2D CPUs: **112/node** active in `c2d` partition up to 2,240 -* Compute Engine API: C3 CPUs: **176/node** active in `c3` partition up to 3,520 -* Compute Engine API: H3 CPUs: **88/node** active in `h3` partition up to 1,408 -* Compute Engine API: A2 CPUs: **96/node** active in `a208` and `a216` partitions -up to 3,072 -* Compute Engine API: NVIDIA A100 80GB GPUs: **8/node** active in `a208` partition - up to 128 -* Compute Engine API: NVIDIA A100 GPUs: **8/node** active in `a216` partition up -to 256 -* Compute Engine API: Resource policies: **one for each job in parallel** - - _not needed for `n2` partition_ - -[hpc-enterprise-slurm-v5-legacy.yaml]: ./hpc-enterprise-slurm-v5-legacy.yaml - ### [hpc-enterprise-slurm.yaml] ![core-badge] This advanced blueprint creates a cluster with Slurm with several performance @@ -549,55 +390,6 @@ This blueprint creates a custom [Apptainer](https:https://apptainer.org) enabled [hpc-slurm6-apptainer.yaml]: ../community/examples/hpc-slurm6-apptainer.yaml -### [ml-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] - -This blueprint provisions an HPC cluster running the Slurm scheduler with the -machine learning frameworks PyTorch and TensorFlow pre-installed on every -VM. The cluster has 2 partitions: - -* [A2 family VMs][a2] with the NVIDIA A100 GPU accelerator -* [G2 family VMs][g2] with the NVIDIA L4 GPU accelerator - -[a2]: https://cloud.google.com/compute/docs/gpus#a100-gpus -[g2]: https://cloud.google.com/compute/docs/gpus#l4-gpus - -To provision the cluster, please run: - -```text -./gcluster create examples/ml-slurm-v5-legacy.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" -./gcluster deploy ml-example -``` - -After accessing the login node, you can activate the conda environment for each -library with: - -```shell -source /etc/profile.d/conda.sh -# to activate PyTorch -conda activate pytorch -# to activate TensorFlow -conda activate tf -``` - -An example benchmarking job for PyTorch can be run under Slurm: - -```shell -cp /var/tmp/torch_test.* . -sbatch -N 1 --gpus-per-node=1 torch_test.sh -``` - -When you are done, clean up the resources in reverse order of creation: - -```text -./gcluster destroy ml-example -``` - -Finally, browse to the [Cloud Console][console-images] to delete your custom -image. It will be named beginning with `ml-slurm` followed by a date and -timestamp for uniqueness. - -[ml-slurm-v5-legacy.yaml]: ../examples/ml-slurm-v5-legacy.yaml - ### [ml-slurm.yaml] ![core-badge] This blueprint provisions an HPC cluster running the Slurm scheduler with the @@ -647,131 +439,6 @@ timestamp for uniqueness. [ml-slurm.yaml]: ../examples/ml-slurm.yaml -### [image-builder-v5-legacy.yaml] ![core-badge] ![deprecated-badge] - -This blueprint uses the [Packer template module][pkr] to create a custom VM -image and uses it to provision an HPC cluster using the Slurm scheduler. By -using a custom image, the cluster is able to begin running jobs sooner and more -reliably because there is no need to install applications as VMs boot. This -example takes the following steps: - -1. Creates a network with outbound internet access in which to build the image (see -[Custom Network](#custom-network-deployment-group-1)). -2. Creates a script that will be used to customize the image (see -[Toolkit Runners](#toolkit-runners-deployment-group-1)). -3. 
Builds a custom Slurm image by executing the script on a standard Slurm image -(see [Packer Template](#packer-template-deployment-group-2)). -4. Deploys a Slurm cluster using the custom image (see -[Slurm Cluster Based on Custom Image](#slurm-cluster-based-on-custom-image-deployment-group-3)). - -#### Building and using the custom image - -Create the deployment folder from the blueprint: - -```text -./gcluster create examples/image-builder-v5-legacy.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" -./gcluster deploy image-builder-001" -``` - -Follow the on-screen prompts to approve the creation of each deployment group. -For example, the network is created in the first deployment group, the VM image -is created in the second group, and the third group uses the image to create an -HPC cluster using the Slurm scheduler. - -When you are done, clean up the resources in reverse order of creation: - -```text -terraform -chdir=image-builder-001/cluster destroy --auto-approve -terraform -chdir=image-builder-001/primary destroy --auto-approve -``` - -Finally, browse to the [Cloud Console][console-images] to delete your custom -image. It will be named beginning with `my-slurm-image` followed by a date and -timestamp for uniqueness. - -[console-images]: https://console.cloud.google.com/compute/images - -#### Why use a custom image? - -Using a custom VM image can be more scalable and reliable than installing -software using boot-time startup scripts because: - -* it avoids reliance on continued availability of package repositories -* VMs will join an HPC cluster and execute workloads more rapidly due to reduced - boot-time configuration -* machines are guaranteed to boot with a static software configuration chosen - when the custom image was created. No potential for some machines to have - different software versions installed due to `apt`/`yum`/`pip` installations - executed after remote repositories have been updated. - -[hpcimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -[pkr]: ../modules/packer/custom-image/README.md -[image-builder-v5-legacy.yaml]: ./image-builder-v5-legacy.yaml - -#### Custom Network (deployment group 1) - -A tool called [Packer](https://packer.io) builds custom VM images by creating -short-lived VMs, executing scripts on them, and saving the boot disk as an -image that can be used by future VMs. The short-lived VM typically operates in a -network that has outbound access to the internet for downloading software. - -This deployment group creates a network using [Cloud Nat][cloudnat] and -[Identity-Aware Proxy (IAP)][iap] to allow outbound traffic and inbound SSH -connections without exposing the machine to the internet on a public IP address. - -[cloudnat]: https://cloud.google.com/nat/docs/overview -[iap]: https://cloud.google.com/iap/docs/using-tcp-forwarding - -#### Toolkit Runners (deployment group 1) - -The Toolkit [startup-script](../modules/scripts/startup-script/README.md) -module supports boot-time configuration of VMs using "runners". Runners are -configured as a series of scripts uploaded to Cloud Storage. A simple, standard -[VM startup script][vmstartup] runs at boot-time, downloads the scripts from -Cloud Storage and executes them in sequence. - -The script in this example performs the trivial task of creating a file as a -simple demonstration of functionality. You can use the startup-script module -to address more complex scenarios. 
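For reference, the runner described above is the `scripts_for_image` startup-script module defined in `image-builder-v5-legacy.yaml`, which appears later in this patch; a minimal sketch of that definition (the shell runner only) is:

```yaml
  - id: scripts_for_image
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell                      # runner type: an executable shell script
        destination: generate_hello.sh   # file name used when the script is staged
        content: |
          #!/bin/sh
          echo "Hello World" > /home/hello.txt
```

Other runner types used later in this patch (for example the `data` runners in the a3 image-building blueprint) follow the same `runners:` list structure.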
- -[vmstartup]: https://cloud.google.com/compute/docs/instances/startup-scripts/linux - -#### Packer Template (deployment group 2) - -The Packer module uses the startup-script module from the first deployment group -and executes the script to produce a custom image. - -#### Slurm Cluster Based on Custom Image (deployment group 3) - -Once the Slurm cluster has been deployed we can test that our Slurm compute -partition is using the custom image. Each compute node should contain the -`hello.txt` file added by the startup-script. - -1. SSH into the login node `slurm-image-builder-001-login0`. -2. Run a job that prints the contents of the added file: - - ```bash - $ srun -N 2 cat /home/hello.txt - Hello World - Hello World - ``` - -#### Quota Requirements for image-builder-v5-legacy.yaml - -For this example the following is needed in the selected region: - -* Compute Engine API: Images (global, not regional quota): 1 image per invocation of `packer build` -* Compute Engine API: Persistent Disk SSD (GB): **~50 GB** -* Compute Engine API: Persistent Disk Standard (GB): **~64 GB static + 32 - GB/node** up to 704 GB -* Compute Engine API: N2 CPUs: **4** (for short-lived Packer VM and Slurm login node) -* Compute Engine API: C2 CPUs: **4** for controller node and **60/node** active - in `compute` partition up to 1,204 -* Compute Engine API: Affinity Groups: **one for each job in parallel** - _only - needed for `compute` partition_ -* Compute Engine API: Resource policies: **one for each job in parallel** - - _only needed for `compute` partition_ - ### [image-builder.yaml] ![core-badge] This blueprint uses the [Packer template module][pkr] to create a custom VM @@ -1056,39 +723,6 @@ For this example the following is needed in the selected region: [pfs-parallelstore.yaml]: ./pfs-parallelstore.yaml [Parallelstore]: ../modules/file-system/parallelstore/README.md -### [cae-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] - -The Computer Aided Engineering (CAE) blueprint captures a reference architecture -where the right cloud components are assembled to optimally cater to the -requirements of computationally-intensive CAE workloads. Specifically, it is -architected around Google Cloud’s VM families that provide a high memory bandwidth -and a balanced memory/flop ratio, which is particularly useful for per-core licensed -CAE software. The solution caters also to large CAE use cases, requiring multiple nodes -that are tightly-coupled via MPI. Special high-memory shapes support even very -memory-demanding workloads with up to 16GB/core. For file IO, different Google managed -high performance NFS storage services are available. For very IO demanding workloads, -third party parallel file systems can be integrated. The scheduling of the workloads -is done by a workload manager. - -The CAE blueprint is intended to be a starting point for more tailored explorations -or installations of specific CAE codes, as provided by ISVs separately. - -A detailed documentation is provided in this [README](cae/README.md). 
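As an illustrative aside (not part of the original README), the compute partitions this architecture defines in `cae-slurm-v5-legacy.yaml` later in this patch are `balance` (the default, H3-based) and `highmem` (C3-highmem); a job targets them with Slurm's partition flag, for example:

```shell
# lands on the default H3-based "balance" partition
srun -N 2 hostname

# explicitly request the "highmem" partition for jobs needing up to 16GB/core
srun -p highmem -N 1 hostname
```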
-
-#### Quota Requirements for cae-slurm-v5-legacy.yaml
-
-For this example the following is needed in the selected region:
-
-* Cloud Filestore API: Basic SSD capacity (GB) per region: **5,120 GB**
-* Cloud Filestore API: High Scale SSD capacity (GB) per region: **10,240 GB**
-* Compute Engine API: H3 CPUs: **88/node** active in `balance` partition up to 880
-* Compute Engine API: C3-highmem CPUs: **176/node** active in `highmem` partition up to 1,760
-* Compute Engine API: N1 CPUs: **8/node** active in `desktop` partition up to 40
-* Compute Engine API: T4 GPUs: **1/node** active in `desktop` partition up to 5
-* Compute Engine API: N2 CPUs: **8** for login and **16** for controller
-
-[cae-slurm-v5-legacy.yaml]: ../examples/cae/cae-slurm-v5-legacy.yaml
-
 ### [cae-slurm.yaml] ![core-badge]
 
 The Computer Aided Engineering (CAE) blueprint captures a reference architecture
@@ -1141,46 +775,6 @@ The blueprint contains 3 groups:
 
 [hpc-build-slurm-image.yaml]: ../community/examples/hpc-build-slurm-image.yaml
 
-### [hpc-slurm-ubuntu2004-v5-legacy.yaml] ![community-badge] ![deprecated-badge]
-
-> **Warning**: The variables `enable_reconfigure`,
-> `enable_cleanup_compute`, and `enable_cleanup_subscriptions`, if set to
-> `true`, require additional dependencies **to be installed on the system deploying the infrastructure**.
->
-> ```shell
-> # Install Python3 and run
-> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt
-> ```
-
-Similar to the [hpc-slurm-v5-legacy.yaml] example, but using Ubuntu 20.04 instead of CentOS 7.
-[Other operating systems] are supported by SchedMD for the Slurm on GCP project and images are listed [here](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family). Only the examples listed on this page have been tested by the Cluster Toolkit team.
-
-The cluster will support 2 partitions named `debug` and `compute`.
-The `debug` partition is the default partition and runs on smaller
-`n2-standard-2` nodes. The `compute` partition is not the default and must be
-selected in the `srun` command via the `--partition` flag. The `compute`
-partition runs on compute-optimized nodes of type `c2-standard-60`. The
-`compute` partition may require additional quota before use.
-
-[Other operating systems]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems
-[hpc-slurm-ubuntu2004-v5-legacy.yaml]: ../community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml
-
-#### Quota Requirements for hpc-slurm-ubuntu2004-v5-legacy.yaml
-
-For this example the following is needed in the selected region:
-
-* Cloud Filestore API: Basic HDD (Standard) capacity (GB): **1,024 GB**
-* Compute Engine API: Persistent Disk SSD (GB): **~50 GB**
-* Compute Engine API: Persistent Disk Standard (GB): **~50 GB static + 50
-  GB/node** up to 1,250 GB
-* Compute Engine API: N2 CPUs: **12**
-* Compute Engine API: C2 CPUs: **4** for controller node and **60/node** active
-  in `compute` partition up to 1,204
-* Compute Engine API: Affinity Groups: **one for each job in parallel** - _only
-  needed for `compute` partition_
-* Compute Engine API: Resource policies: **one for each job in parallel** -
-  _only needed for `compute` partition_
-
 ### [hpc-slurm-ubuntu2004.yaml] ![community-badge]
 
 Similar to the [hpc-slurm.yaml] example, but using Ubuntu 20.04 instead of CentOS 7.
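Deployment of this example follows the same `gcluster` workflow used throughout this document; a hypothetical invocation (the blueprint path is inferred from the section heading, and the deployment name placeholder should be replaced with the blueprint's `deployment_name` value):

```text
./gcluster create community/examples/hpc-slurm-ubuntu2004.yaml \
    --vars "project_id=${GOOGLE_CLOUD_PROJECT}"
./gcluster deploy <deployment_name>
```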
@@ -1212,18 +806,6 @@ For this example the following is needed in the selected region: * Compute Engine API: Resource policies: **one for each job in parallel** - _only needed for `compute` partition_ -### [hpc-amd-slurm-v5-legacy.yaml] ![community-badge] ![deprecated-badge] - -This example provisions a Slurm cluster using AMD VM machine types. It -automates the initial setup of Spack, including a script that can be used to -install the AMD Optimizing C/C++ Compiler ([AOCC]) and compile OpenMPI with -AOCC. It is more extensively discussed in a dedicated [README for AMD -examples][amd-examples-readme]. - -[hpc-amd-slurm-v5-legacy.yaml]: ../community/examples/AMD/hpc-amd-slurm-v5-legacy.yaml -[AOCC]: https://developer.amd.com/amd-aocc/ -[amd-examples-readme]: ../community/examples/AMD/README.md - ### [hpc-amd-slurm.yaml] ![community-badge] This example provisions a Slurm cluster using AMD VM machine types. It @@ -1380,17 +962,6 @@ the nodes are provisioned. All nodes mount a filestore instance on `/home`. [omnia-github]: https://github.com/dellhpc/omnia [omnia-cluster.yaml]: ../community/examples/omnia-cluster.yaml -### [hpc-slurm-local-ssd-v5-legacy.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] - -This blueprint demonstrates the use of Slurm and Filestore, with the definition -of a partition which deploys compute nodes that have local ssd drives deployed. -Before deploying this blueprint, one must first ensure to have an existing VPC -properly configured (allowing Internet access and allowing inter virtual -machine communications, for NFS and also for communications between the Slurm -nodes) - -[hpc-slurm-local-ssd-v5-legacy.yaml]: ../community/examples/hpc-slurm-local-ssd-v5-legacy.yaml - ### [hpc-slurm-local-ssd.yaml] ![community-badge] ![experimental-badge] This blueprint demonstrates the use of Slurm and Filestore, with compute nodes @@ -1628,18 +1199,6 @@ walks through the use of this blueprint. [htc-htcondor.yaml]: ../community/examples/htc-htcondor.yaml [hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -### [htc-slurm-v5-legacy.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] - -This blueprint provisions a cluster using the Slurm scheduler in a configuration -tuned for the execution of many short-duration, loosely-coupled (non-MPI) jobs. - -For more information see: - -* [Slurm on Google Cloud High Throughput documentation](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md) -* [General Slurm High Throughput documentation](https://slurm.schedmd.com/high_throughput.html) - -[htc-slurm-v5-legacy.yaml]: ../community/examples/htc-slurm-v5-legacy.yaml - ### [htc-slurm.yaml] ![community-badge] This blueprint provisions a cluster using the Slurm scheduler in a configuration @@ -1693,20 +1252,6 @@ tutorial. [tutorial-fluent.yaml]: ../community/examples/tutorial-fluent.yaml -### [hpc-slurm-chromedesktop-v5-legacy.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] - -This example shows how to use the `chrome-remote-desktop` module with a Slurm -partition to be able to `salloc` a GPU accelerated remote desktop. - -After deploying the blueprint perform the following actions: -1. SSH to the Slurm login node or controller. -1. Provision a remote desktop with the following command: `salloc -p desktop -N - 1` -1. Once you see `salloc: Nodes slurmchrom-desktop-ghpc-0 are ready for job`, - follow the [instructions to set up the remote desktop][crd-instructions]. 
- -[crd-instructions]: ../community/modules/remote-desktop/chrome-remote-desktop/README.md#setting-up-the-remote-desktop -[hpc-slurm-chromedesktop-v5-legacy.yaml]: ../community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml ### [flux-cluster.yaml] ![community-badge] ![experimental-badge] The [flux-cluster.yaml] blueprint describes a flux-framework cluster where flux diff --git a/examples/cae/README.md b/examples/cae/README.md index 6107bf831e..8cc2274e52 100644 --- a/examples/cae/README.md +++ b/examples/cae/README.md @@ -1,7 +1,3 @@ -> **_NOTE:_** This document uses Slurm-GCP v5 version of CAE blueprint. You can -> also use Slurm-GCP v6 version of the CAE blueprint in this folder. it would -> require to append "-v6" suffix at the end of blueprint name/ deployment folder. - # Computer Aided Engineering (CAE) Reference Architecture The Computer Aided Engineering (CAE) [blueprint](./cae-slurm.yaml) in diff --git a/examples/cae/cae-slurm-v5-legacy.yaml b/examples/cae/cae-slurm-v5-legacy.yaml deleted file mode 100644 index 01dddbecdb..0000000000 --- a/examples/cae/cae-slurm-v5-legacy.yaml +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# -# **************** -####### CAE Solution Blueprint ####### -# **************** -# -# This blueprint features a reference design suited for CAE applications on GCP. -# It sets up the following infrastructure: -# * Google's H3 VMs, ideally suited for CAE workloads -# * Google's C3-highmem VM, suited for workloads with 16GB/core requirement -# * Google's Filestore NFS-based shared storage -# * Google's Chrome Remote Desktop -# * SLURM workload scheduler -# -blueprint_name: cae-slurm -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: cae-slurm - # check here for other regions with H3 deployments: https://cloud.google.com/compute/docs/regions-zones - # For example - # region: europe-west4 - # zone: europe-west4-b - region: us-central1 - zone: us-central1-a - # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - # for a list of valid family options with Slurm; note: the image types for the compute nodes - # and the Chrome Remote Desktop (CRD) need to have the same Slurm base. 
- instance_image: - family: slurm-gcp-5-12-hpc-centos-7 - project: schedmd-slurm-public - crd_instance_image: - family: slurm-gcp-5-12-debian-11 # must be Debian for CRD - project: schedmd-slurm-public - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md -deployment_groups: - -# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# -# -# Deployment Group: Setup -# -# Sets up VPC network, persistent NFS shares, dashboard -# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -- group: setup - modules: - - ####### Virtual Private Cloud Setup ####### - # This creates a virtual private network for your cloud setup - - id: network1 - source: modules/network/vpc - settings: - network_name: cae-slurm-net - subnetwork_name: primary-subnet - - ####### User Home Storage ####### - # This block creates an NFS file share for /home - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - filestore_tier: BASIC_SSD - size_gb: 2560 - filestore_share_name: homeshare - local_mount: /home - - ####### Shared Software Storage ####### - # This block creates NFS file share for shared software installations - - id: appsfs - source: modules/file-system/filestore - use: [network1] - settings: - filestore_tier: BASIC_SSD - size_gb: 2560 - filestore_share_name: appsshare - local_mount: /apps - - ####### Dashboard ####### - # This module activates integration with a dashboard on the Google Cloud Console - - id: hpc_dash - source: modules/monitoring/dashboard - outputs: [instructions] - -# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# -# -# Deployment Group: Software Installation -# -# This deployment group is a stub for installing software before -# bringing up the actual cluster. -# See the README.md for useful software deployment patterns. -# -# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# - group: software_installation -# modules: - -# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# -# -# Deployment Group: Cluster -# -# Provisions the actual CAE cluster with compute partitions, -# remote desktop partition and connects to the previously set up -# NFS shares. -# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -- group: cluster - modules: - - ####### Scratch ####### - # This block creates an NFS file share for scratch. If you experience an IO bottleneck, - # consider to use the more performant version HIGH_SCALE_SSD with the following settings: - - id: scratchfs - source: modules/file-system/filestore - use: [network1] - settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 # smallest size for HIGH_SSD_SCALE - filestore_share_name: scratchshare - local_mount: /scratch - - # If you require maximum IO performance, you can consider to bring up a dedicated parallel - # file system, e.g. DDN Exascaler Lustre, Sycomp GPFS, or Parallelstore. - # Note: Those solutions may have associated license cost. 
- # - # Please visit here for more information - # - DDN Exascaler Lustre: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/DDN-EXAScaler/README.md - # - Sycomp IBM Spectrum Scale: https://console.developers.google.com/marketplace/product/sycomp/sycomp-storage-fueled-by-ibm-spectrum-scale - # - Parallelstore: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/modules/file-system/parallelstore/README.md - - ######## Remote Desktop(s) ####### - # This block enables a partition for nodes that support Chrome Remote Desktop - # see here for use: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#hpc-slurm-chromedesktopyaml-- - - id: remotedesktop - source: community/modules/remote-desktop/chrome-remote-desktop - use: [network1] - settings: - install_nvidia_driver: true - # instance_count: 0 will create installation scripts only - # which can be used with slurm node provisioning - instance_count: 0 - - - id: remotedesktop_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - machine_type: n1-standard-8 - node_count_dynamic_max: 5 - instance_image: $(vars.crd_instance_image) - guest_accelerator: - - type: nvidia-tesla-t4-vws - count: 1 - - - id: remotedesktop_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - appsfs - - scratchfs - - remotedesktop - - remotedesktop_node_group - settings: - partition_name: desktop - enable_placement: false - partition_startup_scripts_timeout: 900 - - ####### Balanced partition ####### - # this block creates a partition uses GCP H3-standard VM for regular jobs with 4GB/core - - id: h3_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 10 - machine_type: h3-standard-88 - disk_type: 'pd-balanced' - bandwidth_tier: gvnic_enabled - - - id: h3_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - appsfs - - scratchfs - - h3_node_group - settings: - partition_name: balance - is_default: true - - ####### High-Mem partition ####### - # this block creates partition uses GCP C3-highmem VM for jobs with 16GB/core requirement - - id: c3_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 10 - machine_type: c3-highmem-176 - disk_type: 'pd-balanced' - bandwidth_tier: tier_1_enabled - - - id: c3_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - appsfs - - scratchfs - - c3_node_group - settings: - partition_name: highmem - - ####### Scheduler: SLURM ####### - # This block creates a SLURM controller - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - homefs - - appsfs - - scratchfs - - h3_partition - - c3_partition - - remotedesktop_partition - settings: - machine_type: n2-standard-16 - compute_startup_scripts_timeout: 900 - cloud_parameters: - resume_rate: 0 - resume_timeout: 900 - suspend_rate: 0 - suspend_timeout: 300 - no_comma_params: false - - ####### Scheduler: SLURM ####### - # This block creates a SLURM login node - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-8 diff --git a/examples/hpc-enterprise-slurm-v5-legacy.yaml 
b/examples/hpc-enterprise-slurm-v5-legacy.yaml deleted file mode 100644 index 99e831ca60..0000000000 --- a/examples/hpc-enterprise-slurm-v5-legacy.yaml +++ /dev/null @@ -1,326 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-enterprise-slurm - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc01 - region: us-central1 - zone: us-central1-a - gpu_zones: [us-central1-a, us-central1-b, us-central1-c, us-central1-f] - slurm_image: - # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - # for a list of valid family options with Slurm - family: slurm-gcp-5-12-hpc-centos-7 - project: schedmd-slurm-public - # If image above is changed to use custom image, then setting below must be set to true - instance_image_custom: false - # Set to true for active cluster reconfiguration. - # Note that setting this option requires additional dependencies to be installed locally. - # https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#description - enable_reconfigure: true - # When set, active compute nodes will be cleaned up on destroy. - # Note that setting this option requires additional dependencies to be installed locally. - enable_cleanup_compute: true - metadata: # Workaround for https://github.com/GoogleCloudPlatform/cluster-toolkit/discussions/3243 - VmDnsSetting: GlobalOnly - -# Recommended to use GCS backend for Terraform state -# See https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#optional-setting-up-a-remote-terraform-state -# -# terraform_backend_defaults: -# type: gcs -# configuration: -# bucket: <> - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - - id: network1 - source: modules/network/pre-existing-vpc - - - id: controller_sa - source: community/modules/project/service-account - settings: - name: controller - project_roles: - - compute.instanceAdmin.v1 - - iam.serviceAccountUser - - logging.logWriter - - monitoring.metricWriter - - pubsub.admin - - storage.objectViewer - - - id: login_sa - source: community/modules/project/service-account - settings: - name: login - project_roles: - - logging.logWriter - - monitoring.metricWriter - - storage.objectViewer - - - id: compute_sa - source: community/modules/project/service-account - settings: - name: compute - project_roles: - - logging.logWriter - - monitoring.metricWriter - - storage.objectCreator - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: projectsfs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /projects - - # This file system has an associated license cost. - # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud - - id: scratchfs - source: community/modules/file-system/DDN-EXAScaler - use: [network1] - settings: - local_mount: /scratch - - - id: n2_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - instance_image: $(vars.slurm_image) - service_account: - email: $(compute_sa.service_account_email) - scopes: - - https://www.googleapis.com/auth/cloud-platform - - - id: n2_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: [n2_node_group, network1, homefs, projectsfs, scratchfs] - settings: - partition_name: n2 - exclusive: false # allows nodes to stay up after jobs are done - enable_placement: false # the default is: true - is_default: true - partition_conf: - SuspendTime: 300 # time (in secs) the nodes in this partition stay active after their tasks have completed - - - id: c2_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - machine_type: c2-standard-60 # this is the default - instance_image: $(vars.slurm_image) - bandwidth_tier: tier_1_enabled - disk_type: pd-ssd - disk_size_gb: 100 - service_account: - email: $(compute_sa.service_account_email) - scopes: - - https://www.googleapis.com/auth/cloud-platform - - # use `-p c2` to submit jobs to this partition: - # ex: `srun -p c2 -N 1 hostname` - - id: c2_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: [c2_node_group, network1, homefs, projectsfs, scratchfs] - settings: - partition_name: c2 - # the following two are true by default - exclusive: true # this must be true if enable_placement is true - enable_placement: true - - - id: c2d_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - machine_type: c2d-standard-112 - instance_image: $(vars.slurm_image) - bandwidth_tier: tier_1_enabled - disk_type: pd-ssd - disk_size_gb: 100 - service_account: - email: $(compute_sa.service_account_email) - scopes: - - https://www.googleapis.com/auth/cloud-platform - - - id: c2d_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: [c2d_node_group, network1, homefs, projectsfs, scratchfs] - settings: - partition_name: c2d - - - id: c3_node_group - source: 
community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - machine_type: c3-highcpu-176 - instance_image: $(vars.slurm_image) - bandwidth_tier: tier_1_enabled - disk_type: pd-ssd - disk_size_gb: 100 - service_account: - email: $(compute_sa.service_account_email) - scopes: - - https://www.googleapis.com/auth/cloud-platform - - - id: c3_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: [c3_node_group, network1, homefs, projectsfs, scratchfs] - settings: - partition_name: c3 - - - id: a2_8_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 16 - machine_type: a2-ultragpu-8g - bandwidth_tier: gvnic_enabled - instance_image: $(vars.slurm_image) - disk_type: pd-ssd - disk_size_gb: 100 - node_conf: - Sockets: 2 - CoresPerSocket: 24 - service_account: - email: $(compute_sa.service_account_email) - scopes: - - https://www.googleapis.com/auth/cloud-platform - - # use `-p a208` to submit jobs to this partition: - # ex: `srun -p a208 --gpus-per-node=8 -N 1 nvidia-smi` - - id: a2_8_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: [a2_8_node_group, network1, homefs, projectsfs, scratchfs] - settings: - partition_name: a208 - # This makes this partition look for machines in any of the following zones - # https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/develop/community/modules/compute/schedmd-slurm-gcp-v5-partition#compute-vm-zone-policies - zones: $(vars.gpu_zones) - # The following allows users to use more host memory without specifying cpus on a job - partition_conf: - DefMemPerGPU: 160000 - DefMemPerCPU: null - - - id: a2_16_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 16 - machine_type: a2-megagpu-16g - bandwidth_tier: gvnic_enabled - instance_image: $(vars.slurm_image) - disk_type: pd-ssd - disk_size_gb: 100 - node_conf: - Sockets: 2 - CoresPerSocket: 24 - service_account: - email: $(compute_sa.service_account_email) - scopes: - - https://www.googleapis.com/auth/cloud-platform - - # use `-p a216` to submit jobs to this partition: - # ex: `srun -p a216 --gpus-per-node=16 -N 1 nvidia-smi` - - id: a2_16_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: [a2_16_node_group, network1, homefs, projectsfs, scratchfs] - settings: - partition_name: a216 - # This makes this partition look for machines in any of the following zones - # https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/develop/community/modules/compute/schedmd-slurm-gcp-v5-partition#compute-vm-zone-policies - zones: $(vars.gpu_zones) - # The following allows users to use more host memory without specifying cpus on a job - partition_conf: - DefMemPerGPU: 160000 - DefMemPerCPU: null - - - id: h3_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 16 - machine_type: h3-standard-88 - bandwidth_tier: gvnic_enabled # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_network - instance_image: $(vars.slurm_image) - service_account: - email: $(compute_sa.service_account_email) - scopes: - - https://www.googleapis.com/auth/cloud-platform - # H3 does not support pd-ssd and pd-standard - # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks - disk_type: pd-balanced - disk_size_gb: 100 - - # use `-p h3` to submit jobs to this partition: - # ex: `srun -p h3 -N 
1 hostname` - - id: h3_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: [h3_node_group, network1, homefs, projectsfs, scratchfs] - settings: - partition_name: h3 - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: [network1, homefs, projectsfs, scratchfs, n2_partition, - c2_partition, c2d_partition, c3_partition, a2_8_partition, a2_16_partition, - h3_partition] - settings: - instance_image: $(vars.slurm_image) - # the following allow for longer boot time - # which is useful for large GPU nodes - cloud_parameters: - no_comma_params: false - resume_rate: 0 - resume_timeout: 600 - suspend_rate: 0 - suspend_timeout: 600 - # we recommend disabling public IPs if possible - # but that requires your network to have a NAT or - # private access configured - disable_controller_public_ips: false - service_account: - email: $(controller_sa.service_account_email) - scopes: - - https://www.googleapis.com/auth/cloud-platform - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - instance_image: $(vars.slurm_image) - machine_type: n2-standard-4 - disable_login_public_ips: false - service_account: - email: $(login_sa.service_account_email) - scopes: - - https://www.googleapis.com/auth/cloud-platform - - - id: hpc_dashboard - source: modules/monitoring/dashboard - outputs: [instructions] diff --git a/examples/hpc-slurm-v5-legacy.yaml b/examples/hpc-slurm-v5-legacy.yaml deleted file mode 100644 index 234277208d..0000000000 --- a/examples/hpc-slurm-v5-legacy.yaml +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-slurm - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-small - region: us-central1 - zone: us-central1-a - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. 
To refer to a local module, prefix with ./, ../ or / - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: debug_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - - - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - debug_node_group - settings: - partition_name: debug - exclusive: false # allows nodes to stay up after jobs are done - enable_placement: false # the default is: true - is_default: true - - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - bandwidth_tier: gvnic_enabled - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - compute_node_group - settings: - partition_name: compute - - - id: h3_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - machine_type: h3-standard-88 - # H3 does not support pd-ssd and pd-standard - # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks - disk_type: pd-balanced - bandwidth_tier: gvnic_enabled - - - id: h3_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - h3_node_group - settings: - partition_name: h3 - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - debug_partition - - compute_partition - - h3_partition - - homefs - settings: - disable_controller_public_ips: false - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-4 - disable_login_public_ips: false diff --git a/examples/image-builder-v5-legacy.yaml b/examples/image-builder-v5-legacy.yaml deleted file mode 100644 index c48627f85d..0000000000 --- a/examples/image-builder-v5-legacy.yaml +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -# Deploying the various groups of this blueprint requires passing the output of -# the primary group to the packer group. 
Instructions for how to do that are -# available at the following link: -# https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#image-builderyaml- - -blueprint_name: image-builder - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: image-builder-001 - region: us-central1 - zone: us-central1-c - custom_image: - family: my-slurm-image - project: $(vars.project_id) - disk_size: 32 - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: scripts_for_image - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: generate_hello.sh - content: | - #!/bin/sh - echo "Hello World" > /home/hello.txt - -- group: packer - modules: - - id: custom-image - source: modules/packer/custom-image - kind: packer - use: - - network1 - - scripts_for_image - settings: - source_image_project_id: [schedmd-slurm-public] - # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-5-12-hpc-centos-7 - # You can find size of source image by using following command - # gcloud compute images describe-from-family --project schedmd-slurm-public - disk_size: $(vars.disk_size) - image_family: $(vars.custom_image.family) - state_timeout: 15m - -- group: cluster - modules: - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - disk_size_gb: $(vars.disk_size) - instance_image: $(vars.custom_image) - instance_image_custom: true - bandwidth_tier: gvnic_enabled - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - compute_node_group - settings: - partition_name: compute - is_default: true - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - compute_partition - settings: - disable_controller_public_ips: false - disk_size_gb: $(vars.disk_size) - instance_image: $(vars.custom_image) - instance_image_custom: true - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - disable_login_public_ips: false - disk_size_gb: $(vars.disk_size) - instance_image: $(vars.custom_image) - instance_image_custom: true diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md b/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md deleted file mode 100644 index 96087ef64c..0000000000 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md +++ /dev/null @@ -1,342 +0,0 @@ -# Objective - -> [!CAUTION] -> This solution is built upon "legacy" blueprints using Slurm-GCP v5. The -> [solution using v6](../README.md) is recommended for all new deployments. -> The legacy solution is presented for customers with existing deployments. We -> recommend maintaining existing deployments with the version of the Toolkit -> originally used to deploy. - -This document will guide you to successfully provisioning a Slurm cluster with -a3-highgpu-8g compute nodes running NVIDIA H100 GPUs. - -## Before starting - -> [!IMPORTANT] -> Before beginning, submit a request to your Google Cloud representative for -> access to the Deep Learning VM Image for a3-highgpu-8g. 
It is currently -> available only by Private Preview request. This image contains patches that -> significantly enhance the network performance of workloads that span multiple -> a3-highgpu-8g VMs. You will use the image ID in the steps shown below. - -## Required setup - -Please follow the initial instructions for: - -- Installing Cluster Toolkit [dependencies][tkdeps] (Go, Terraform, Packer) -- Installing the Cluster [Toolkit][tkinstall] - -Verify that your release of the Cluster Toolkit is greater than 1.31.1 and less -than or equal to 1.37.0. - -```shell -gcluster --version -``` - -The solution requires several Python packages to be available. We recommend -installing them in a Python virtual environment: - -```shell -python3 -m venv toolkit-a3 -source toolkit-a3/bin/activate -pip3 install -r \ - https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt -``` - -**Always** activate the environment before running any gcluster commands such as -deploy or destroy. - -```shell -source /absolute/path/to/toolkit-a3/bin/activate -``` - -## Top-Level Design of Solution - -The solution is split into 3 Cluster Toolkit blueprints: - -1. Provision 1 system network and 1 Filestore instance for mounting `/home` -across the cluster. -2. Build a custom image installing Slurm in an Ubuntu 20.04 image. The image -runs a kernel patched with performance enhancements for the a3-highgpu-8g VM. -3. Provision 4 GPU networks and a Slurm cluster using the custom image. - -The 1st and 2nd blueprints should be provisioned once and rarely need further -modification. This approach separates the lifecycle of a Filestore instance from -the lifecycle of the cluster, allowing the cluster to be deleted while retaining -access to data and home directories. The 3rd cluster blueprint may be more -frequently updated and re-provisioned as discussed below. - -## First time considerations - -> [!IMPORTANT] -> These steps do not need to be repeated when a cluster is re-provisioned. They -> are initial setup steps in a project. - -Replace the values for `PROJECT_ID`, `REGION`, and `ZONE` with the project, -region, and zone in which you have an a3-highgpu-8g allocation. The value for -`BUCKET` must be unique and will be used to create a new bucket. After replacing -the values, execute them so that they automatically populate parameters in the -commands shown below. Note that each a3-highgpu-8g VM (`N_VMS`) contains 8 NVIDIA -H100 GPUs. 
- -```shell -export PROJECT_ID=customer-project-id -export BUCKET=customer-bucket -export REGION=customer-region -export ZONE=customer-zone -export N_VMS=32 -``` - -### Saving Terraform state -Create a bucket with versioning enabled to store Terraform state: - -```shell -gcloud storage buckets create gs://${BUCKET} --project=${PROJECT_ID} \ - --default-storage-class=STANDARD --location=${REGION} \ - --uniform-bucket-level-access -gcloud storage buckets update gs://${BUCKET} --versioning -``` - -Modify all 3 blueprints to configure the new bucket to serve as a Terraform -remote backend: - -```yaml -terraform_backend_defaults: - type: gcs - configuration: - bucket: customer-bucket # modify to bucket created above -``` - -### Set default values - -Modify the the deployment variables `project_id`, `region`, `zone`, in the -`vars` block of all 3 blueprints: - -```yaml - project_id: customer-project - region: customer-region - zone: customer-zone -``` - -### Set kernel-patched OS image - -Obtain values for `source_image_project_id` and `source_image` from your Google -Cloud representative. Set them at approximately lines 33 and 34 of -`ml-slurm-a3-1-image.yaml`. - -```yaml - source_image_project_id: source-image-project-id # use value supplied by Google Cloud staff - source_image: source-image-name # use value supplied by Google Cloud staff -``` - -### Reservation created by Google - -> [!IMPORTANT] -> If you have ***not*** received a VM reservation from Google Cloud staff, then -> skip this step and proceed to [manual reservation creation](#manual-creation-of-reservation). - -Set the deployment variable `a3_reservation_name` at approximately line 38 of -`ml-slurm-a3-2-cluster.yaml` to the reservation name provided by Google. The -value for `a3_maintenance_interval` should also be set as directed by Google -staff. A common setting is `PERIODIC`, shown below, but this value must be -confirmed with Google staff. - -```yaml - # a3_reservation_name must be specified; if Google staff have provided you - # with a reservation name, use it. Otherwise supply user-created reservation. - a3_reservation_name: reservation-name-provided-by-google - # a3_maintenance_interval should be empty string by default; if Google staff - # have created a reservation, they will also provide a3_maintenance_interval - a3_maintenance_interval: PERIODIC -``` - -### Manual creation of reservation - -> [!IMPORTANT] -> If you received a VM reservation from Google Cloud staff, then skip this step -> after confirming that you followed the instructions in [reservation created by -> Google](#reservation-created-by-google). - -We recommend creating a reservation to ensure reliable access to re-create VMs -if you need to redeploy or otherwise maintain your cluster. - -```shell -gcloud compute reservations create a3-reservation-0 \ - --project=${PROJECT_ID} \ - --machine-type=a3-highgpu-8g \ - --vm-count=${N_VMS} \ - --zone=${ZONE} \ - --require-specific-reservation \ - --log-http -``` - -This reservation be must be specified when creating VMs with matching parameters -(e.g. a3-highgpu-8g VM in configured zone). If you executed the command above -without modification, you may leave `a3_reservation_name` and -`a3_maintenance_interval` at their default values in -`ml-slurm-a3-2-cluster.yaml`. Otherwise, ensure that the reservation name in the -blueprint matches the name of the user-created reservation. - -```yaml - # a3_reservation_name must be specified; if Google staff have provided you - # with a reservation name, use it. 
Otherwise supply user-created reservation. - a3_reservation_name: a3-reservation-0 - # a3_maintenance_interval should be empty string by default; if Google staff - # have created a reservation, they will also provide a3_maintenance_interval - a3_maintenance_interval: "" -``` - -### Set cluster size - -At approximately line 37 of `ml-slurm-a3-2-cluster.yaml`, set the static cluster -size. Recall that there are 8 NVIDIA H100 GPUs per a3-highgpu-8g VM. - -```yaml - a3_static_cluster_size: 32 -``` - -## Cluster creation - -The blueprint `ml-slurm-a3-0-base.yaml` will create 1 system network and a -Filestore `/home` filesystem. Run the standard Toolkit workflow at the command -line (approx. 5 minutes): - -```shell -gcluster deploy ml-slurm-a3-0-base.yaml --auto-approve -``` - -Several values will be output to the screen. The output will be similar to: - -```hcl -network_name_sysnet = "sys-net" -network_storage_homefs = { - "client_install_runner" = { - "destination" = "install-nfs_home.sh" - "source" = "modules/embedded/modules/file-system/filestore/scripts/install-nfs-client.sh" - "type" = "shell" - } - "fs_type" = "nfs" - "local_mount" = "/home" - "mount_options" = "defaults,_netdev" - "mount_runner" = { - "args" = "\"10.224.153.226\" \"/nfsshare\" \"/home\" \"nfs\" \"defaults,_netdev\"" - "destination" = "mount_home.sh" - "source" = "modules/embedded/modules/file-system/filestore/scripts/mount.sh" - "type" = "shell" - } - "remote_mount" = "/nfsshare" - "server_ip" = "10.224.153.226" -} -subnetwork_name_sysnet = "sys-subnet" -``` - -Build the custom image using ml-slurm-a3-1-image.yaml and the same workflow -as above. Run at the command line: - -```shell -gcluster deploy ml-slurm-a3-1-image.yaml --auto-approve -``` - -The image will take approximately 30 minutes to build. - -> [!IMPORTANT] -> You must modify `ml-slurm-a3-2-cluster.yaml` to update the IP address of the -> Filestore instance for `/home`. Your IP address will differ from that shown -> below and must match the output from deploying the base blueprint above: -> -> ```yaml -> server_ip_homefs: 10.224.153.226 -> ``` - -Provision the cluster blueprint (approximately 5-10 minutes): - -```shell -gcluster deploy ml-slurm-a3-2-cluster.yaml --auto-approve -``` - -## Receive Data Path Manager (RxDM) - -To achieve optimal application performance, an additional service called the -"Receive Data Path Manager" (RxDM) must run with the same lifetime as the job. -Additionally, a NCCL plugin must be installed into the execution environment of -the workload. Both the RxDM and plugin are distributed by Docker container -images. - -This blueprint includes a Slurm "Prolog" and "Epilog" script that will run -before and after every job running on more than 1 a3-highgpu-8g compute node. -The Prolog will perform the following actions: - -- Install the NCCL plugin into /var/lib of the host -- Run the RxDM service - - This is a long-lived service that runs alongside the job - - Mounts `/var/lib/nvidia/lib64` into `/usr/lib/nvidia/lib64` of the container - - Mount `/opt/tcpdirect_benchmark/` from the host into the container so that a - textproto file defining the mapping from GPU to NIC is available. This file - is present in the Deep Learning VM (DLVM) images that contain TCPDirect - patches. - - Mount `/run/tcpx-${SLURM_JOB_ID}` from the container into the host. This is - set to the environment variables `${UDS_PATH}` in the script. 
This directory - contains Unix socket files that implement a TCPx interface available to the - user workload at `${UDS_PATH}`. The job must be configured to be aware of this - path using `NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX` environment variable! - -The Epilog will - -- Stop the RxDM service -- Prune any stopped containers (freeing up disk space) -- Remove the directory at `${UDS_PATH}` - -## Jobs using the RxDM / TCPx - -Jobs that are running across multiple a3-highgpu-8g VMs will benefit from using -the RxDM and the NCCL plugin. An example containerized job is located at -`/opt/apps/scripts/run-nccl-tests.sh`. In addition to setting standard NCCL -configuration values, a job must: - -- Set `NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX` to `${UDS_PATH}` -- Set the `LD_LIBRARY_PATH` to include `/var/lib/tcpx/lib64` and `/usr/local/nvidia/lib64` - -If job is containerized - -- Mount `${UDS_PATH}` into the container at the same path -- Mount `/var/lib/tcpx/lib64` to `/var/lib/tcpx/lib64` in the container (to make the - NCCL plugin available) -- Paths can be modified if `LD_LIBRARY_PATH` is likewise modified - -## Example workload (NCCL benchmark) - -The example workload below demonstrates the pattern recommended in Activating -the Receive Data Path Manager during jobs while running the standard nccl-tests -benchmark. It assumes the availability of a GPU/NIC topology file at -`/opt/tcpdirect_benchmark/gpu_rxq_configuration.textproto`. This file is built -into the DLVM images used by this solution, but may need to be provided if -using an alternative image. - -### Clone the Cluster Toolkit repository containing the NCCL benchmark - -```shell -git clone https://github.com/GoogleCloudPlatform/cluster-toolkit -cd cluster-toolkit/examples/machine-learning/a3-highgpu-8g/nccl-tests -``` - -### Import the PyTorch image from the NVIDIA Container Registry - -```shell -bash import_pytorch_container.sh -``` - -### Build NCCL - -```shell -sbatch build-nccl-tests.sh -``` - -### Run NCCL tests - -```shell -sbatch run-nccl-tests.sh -``` - -[consume]: https://cloud.google.com/compute/docs/instances/reservations-consume#consuming_instances_from_any_matching_reservation -[tkdeps]: https://cloud.google.com/cluster-toolkit/docs/setup/install-dependencies -[tkinstall]: https://github.com/GoogleCloudPlatform/cluster-toolkit/#quickstart diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-0-base-v5-legacy.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-0-base-v5-legacy.yaml deleted file mode 100644 index 79c06980d3..0000000000 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-0-base-v5-legacy.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -blueprint_name: slurm-a3-base - -terraform_backend_defaults: - type: gcs - configuration: - bucket: customer-tf-state-bucket - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-a3-base - region: customer-region - zone: customer-zone - sys_net_range: 172.16.0.0/16 - filestore_ip_range: 192.168.0.0/29 - -deployment_groups: -- group: primary - modules: - - id: sysnet - source: modules/network/vpc - settings: - network_name: $(vars.deployment_name)-sysnet - network_address_range: $(vars.sys_net_range) - mtu: 8244 - # using explicit var.subnetworks to allow for easier addition - # of multiple system subnetworks in other regions - subnetworks: - - subnet_name: $(vars.deployment_name)-sysnet-subnet - subnet_region: $(vars.region) - new_bits: 4 - subnet_private_access: true - description: primary subnetwork in $(vars.deployment_name)-sysnet - outputs: - - network_name - - subnetwork_name - - - id: homefs - source: modules/file-system/filestore - use: - - sysnet - settings: - filestore_tier: BASIC_SSD - size_gb: 2560 - local_mount: /home - reserved_ip_range: $(vars.filestore_ip_range) - outputs: - - network_storage diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml deleted file mode 100644 index 08060286b6..0000000000 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -blueprint_name: slurm-a3-image - -terraform_backend_defaults: - type: gcs - configuration: - bucket: customer-tf-state-bucket - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-a3-image - region: customer-region - zone: customer-zone - disk_size: 200 - final_image_family: slurm-dlvm - network_name_system: slurm-a3-base-sysnet - subnetwork_name_system: slurm-a3-base-sysnet-subnet - slurm_cluster_name: slurm0 - source_image_project_id: source-image-project-id # use value supplied by Google Cloud staff - source_image: source-image-name # use value supplied by Google Cloud staff - -deployment_groups: -- group: build_script - modules: - - id: sysnet - source: modules/network/pre-existing-vpc - settings: - network_name: $(vars.network_name_system) - subnetwork_name: $(vars.subnetwork_name_system) - - - id: image_build_script - source: modules/scripts/startup-script - settings: - install_ansible: true - docker: - enabled: true - world_writable: true - configure_ssh_host_patterns: - - 10.0.0.* - - 10.1.0.* - - 10.2.0.* - - 10.3.0.* - - $(vars.slurm_cluster_name)* - runners: - - type: shell - destination: workaround_apt_change.sh - content: | - #!/bin/bash - set -e -o pipefail - rm -f /etc/apt/sources.list.d/kubernetes.list - apt-get update --allow-releaseinfo-change - - type: shell - destination: disable_dlvm_builtin_services.sh - content: | - #!/bin/bash - # many extra services are being started via /etc/rc.local; disable - # them on future boots of image - echo -e '#!/bin/bash\n/usr/bin/nvidia-persistenced --user root\nexit 0' > /etc/rc.local - # disable jupyter and notebooks-collection-agent services - systemctl stop jupyter.service notebooks-collection-agent.service - systemctl disable jupyter.service notebooks-collection-agent.service - - type: data - destination: /var/tmp/slurm_vars.json - content: | - { - "reboot": false, - "slurm_version": "23.02.7", - "install_cuda": false, - "nvidia_version": "latest", - "install_ompi": true, - "install_lustre": false, - "install_gcsfuse": true, - "monitoring_agent": "cloud-ops" - } - - type: shell - destination: install_slurm.sh - content: | - #!/bin/bash - set -e -o pipefail - ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents - ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 5.12.2 \ - -i localhost, --limit localhost --connection=local \ - -e @/var/tmp/slurm_vars.json \ - ansible/playbook.yml - # this duplicates the ulimits configuration of the HPC VM Image - - type: data - destination: /etc/security/limits.d/99-unlimited.conf - content: | - * - memlock unlimited - * - nproc unlimited - * - stack unlimited - * - nofile 1048576 - * - cpu unlimited - * - rtprio unlimited - - type: data - destination: /etc/systemd/system/slurmd.service.d/file_ulimit.conf - content: | - [Service] - LimitNOFILE=infinity - - type: data - destination: /etc/systemd/system/delay-a3.service - content: | - [Unit] - Description=Delay A3 boot until all network interfaces are routable - After=network-online.target - Wants=network-online.target - Before=google-startup-scripts.service - - [Service] - ExecCondition=/bin/bash -c '/usr/bin/curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/machine-type | grep -q "/a3-highgpu-8g$"' - ExecStart=/usr/lib/systemd/systemd-networkd-wait-online -i enp6s0 -i enp12s0 -i enp134s0 -i enp140s0 -o routable --timeout=120 - ExecStartPost=/bin/sleep 60 - - [Install] - WantedBy=multi-user.target - - type: shell - destination: 
install_enroot_pyxis.sh - content: | - #!/bin/bash - set -e -o pipefail - ### Setting up Enroot - if ! dpkg -l enroot &>/dev/null; then - arch=\$(dpkg --print-architecture) - curl -fSsL -O https://github.com/NVIDIA/enroot/releases/download/v3.4.1/enroot_3.4.1-1_${arch}.deb - curl -fSsL -O https://github.com/NVIDIA/enroot/releases/download/v3.4.1/enroot+caps_3.4.1-1_${arch}.deb # optional - apt-get update - apt-get install --assume-yes ./*.deb - rm enroot*.deb - fi - # configure enroot - # use single quotes around EOT to avoid shell interpolation - cat <<'EOT' > /etc/enroot/enroot.conf - ENROOT_RUNTIME_PATH /mnt/localssd/${UID}/enroot/runtime - ENROOT_CACHE_PATH /mnt/localssd/${UID}/enroot/cache - ENROOT_DATA_PATH /mnt/localssd/${UID}/enroot/data - ENROOT_TEMP_PATH /mnt/localssd/${UID}/enroot - EOT - ### Install Pyxis - if [ ! -f "/usr/local/lib/slurm/spank_pyxis.so" ]; then - git clone --depth 1 https://github.com/NVIDIA/pyxis.git - cd pyxis && make install && cd - - rm -rf pyxis - echo "required /usr/local/lib/slurm/spank_pyxis.so" > /etc/slurm/plugstack.conf - fi - - type: shell - destination: install_mdadm.sh - content: | - #!/bin/bash - apt-get update - apt-get install mdadm --no-install-recommends --assume-yes - - type: data - destination: /usr/local/ghpc/mount_localssd.sh - content: | - #!/bin/bash - set -e -o pipefail - - RAID_DEVICE=/dev/md0 - DST_MNT=/mnt/localssd - DISK_LABEL=LOCALSSD - OPTIONS=discard,defaults - - # if mount is successful, do nothing - if mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"; then - exit 0 - fi - - # Create new RAID, format ext4 and mount - # TODO: handle case of zero or 1 local SSD disk - # TODO: handle case when /dev/md0 exists but was not mountable for - # some reason - DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | paste -sd ' '` - NB_DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | wc -l` - mdadm --create "$RAID_DEVICE" --level=0 --raid-devices=$NB_DEVICES $DEVICES - mkfs.ext4 -F "$RAID_DEVICE" - tune2fs "$RAID_DEVICE" -r 131072 - e2label "$RAID_DEVICE" "$DISK_LABEL" - mkdir -p "$DST_MNT" - mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS" - chmod 1777 "$DST_MNT" - - type: data - destination: /etc/systemd/system/mount-local-ssd.service - content: | - [Unit] - Description=Assemble local SSDs as software RAID; then format and mount - - [Service] - ExecCondition=bash -c '/usr/bin/curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/machine-type | grep -q "/a3-highgpu-8g$"' - ExecStart=/bin/bash /usr/local/ghpc/mount_localssd.sh - - [Install] - WantedBy=local-fs.target - - type: shell - destination: install_dcgm.sh - content: | - #!/bin/bash - set -e -o pipefail - apt-key del 7fa2af80 - distribution=\$(. 
/etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g') - wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.0-1_all.deb - dpkg -i cuda-keyring_1.0-1_all.deb - apt-get update - apt-get install -y datacenter-gpu-manager - # libnvidia-nscq needed for A100/A800 and H100/H800 systems - apt-get install -y libnvidia-nscq-550 - - type: shell - destination: add_dcgm_to_op_config.sh - content: | - #!/bin/bash - tee -a /etc/google-cloud-ops-agent/config.yaml > /dev/null << EOF - metrics: - receivers: - dcgm: - type: dcgm - service: - pipelines: - dcgm: - receivers: - - dcgm - EOF - - type: shell - destination: systemctl_services.sh - content: | - #!/bin/bash - set -e -o pipefail - # workaround b/309016676 (systemd-resolved restarts 4 times causing DNS - # resolution failures during google-startup-scripts.service) - systemctl daemon-reload - systemctl enable delay-a3.service - systemctl enable mount-local-ssd.service - systemctl enable nvidia-dcgm - - type: shell - destination: remove_snap_gcloud.sh - content: | - #!/bin/bash - # THIS RUNNER MUST BE THE LAST RUNNER BECAUSE IT WILL BREAK GSUTIL IN - # PARENT SCRIPT OF STARTUP-SCRIPT MODULE - set -e -o pipefail - # Remove original DLVM gcloud, lxds install due to conflict with snapd and NFS - snap remove google-cloud-cli lxd - # Install key and google-cloud-cli from apt repo - GCLOUD_APT_SOURCE="/etc/apt/sources.list.d/google-cloud-sdk.list" - if [ ! -f "${GCLOUD_APT_SOURCE}" ]; then - # indentation matters in EOT below; do not blindly edit! - cat < "${GCLOUD_APT_SOURCE}" - deb [signed-by=/usr/share/keyrings/cloud.google.asc] https://packages.cloud.google.com/apt cloud-sdk main - EOT - fi - curl -o /usr/share/keyrings/cloud.google.asc https://packages.cloud.google.com/apt/doc/apt-key.gpg - apt-get update - apt-get install --assume-yes google-cloud-cli - # Clean up the bash executable hash for subsequent steps using gsutil - hash -r - -- group: slurm-build - modules: - - id: slurm-image - source: modules/packer/custom-image - kind: packer - use: - - image_build_script - - sysnet - settings: - # building this image does not require a GPU-enabled VM but must *not* be - # run on a N-series VM otherwise, the "open" drivers will not install - machine_type: c2d-standard-32 - source_image_project_id: [$(vars.source_image_project_id)] - source_image: $(vars.source_image) - image_family: $(vars.final_image_family) diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-2-cluster-v5-legacy.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-2-cluster-v5-legacy.yaml deleted file mode 100644 index b504650c7f..0000000000 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-2-cluster-v5-legacy.yaml +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -blueprint_name: slurm-a3-cluster - -terraform_backend_defaults: - type: gcs - configuration: - bucket: customer-tf-state-bucket # modify to be a bucket owned and writable by customer - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-a3-cluster - region: customer-region - zone: customer-zone - server_ip_homefs: 0.0.0.0 ## MUST set to IP address of Filestore instance from base deployment! - remote_mount_homefs: /nfsshare - local_mount_homefs: /home - zones: - - $(vars.zone) - disk_size_gb: 200 - final_image_family: slurm-dlvm - slurm_cluster_name: slurm0 - enable_reconfigure: true - enable_cleanup_compute: true - enable_cleanup_subscriptions: true - a3_partition_name: a3 - a3_static_cluster_size: 32 - # a3_reservation_name must be specified; if Google staff have provided you - # with a reservation name, use it. Otherwise supply user-created reservation. - a3_reservation_name: a3-reservation-0 - # a3_maintenance_interval should be empty string by default; if Google staff - # have created a reservation, they will also provide a3_maintenance_interval - a3_maintenance_interval: "" - # network parameters must match base blueprint deployment_name! - # these values are accurate if deployment_name was not modified from example - network_name_system: slurm-a3-base-sysnet - subnetwork_name_system: slurm-a3-base-sysnet-subnet - -deployment_groups: -- group: cluster - modules: - - id: sysnet - source: modules/network/pre-existing-vpc - settings: - network_name: $(vars.network_name_system) - subnetwork_name: $(vars.subnetwork_name_system) - - - id: gpunets - source: modules/network/multivpc - settings: - global_ip_address_range: 10.0.0.0/9 - network_name_prefix: $(vars.deployment_name)-gpunet - network_count: 4 - subnetwork_cidr_suffix: 20 - - - id: homefs - source: modules/file-system/pre-existing-network-storage - settings: - server_ip: $(vars.server_ip_homefs) - remote_mount: $(vars.remote_mount_homefs) - local_mount: $(vars.local_mount_homefs) - - - id: compute_sa - source: community/modules/project/service-account - settings: - name: compute - project_roles: - - logging.logWriter - - monitoring.metricWriter - - pubsub.subscriber - - storage.objectAdmin - - - id: debug_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_static: 0 - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - instance_image_custom: true - instance_image: - family: $(vars.final_image_family) - project: $(vars.project_id) - - - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - debug_node_group - - sysnet - - homefs - settings: - partition_name: debug - exclusive: false - enable_placement: false - - - id: a3_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - use: - - gpunets - settings: - reservation_name: $(vars.a3_reservation_name) - maintenance_interval: $(vars.a3_maintenance_interval) - node_count_static: $(vars.a3_static_cluster_size) - node_count_dynamic_max: 0 - disk_type: pd-ssd - machine_type: a3-highgpu-8g - instance_image_custom: true - disable_public_ips: true - enable_smt: true - instance_image: - family: $(vars.final_image_family) - project: $(vars.project_id) - node_conf: - CoresPerSocket: 52 - ThreadsPerCore: 2 - on_host_maintenance: TERMINATE - service_account: - email: $(compute_sa.service_account_email) - scopes: - - cloud-platform - bandwidth_tier: gvnic_enabled - - - id: a3_partition - source: 
community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - a3_node_group - - sysnet - - homefs - settings: - partition_name: $(vars.a3_partition_name) - enable_placement: false - exclusive: false - is_default: true - partition_conf: - OverSubscribe: EXCLUSIVE - - - id: controller_startup - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: stage_scripts.sh - content: | - #!/bin/bash - # use script from master branch which is actively maintained - curl -s --create-dirs -o /opt/apps/adm/slurm/scripts/receive-data-path-manager \ - https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/master/tools/prologs-epilogs/receive-data-path-manager - chmod 0755 /opt/apps/adm/slurm/scripts/receive-data-path-manager - mkdir -p /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-prolog_slurmd.d - mkdir -p /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-epilog_slurmd.d - ln -s /opt/apps/adm/slurm/scripts/receive-data-path-manager /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-prolog_slurmd.d/start-rxdm.prolog_slurmd - ln -s /opt/apps/adm/slurm/scripts/receive-data-path-manager /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-epilog_slurmd.d/stop-rxdm.epilog_slurmd - - type: shell - destination: reset_enroot.sh - content: | - #!/bin/bash - # reset enroot to defaults of files under /home and running under /run - # allows basic enroot testing on login/controller nodes (reduced I/O) - rm -f /etc/enroot/enroot.conf - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - sysnet - - a3_partition - - debug_partition - - homefs - settings: - machine_type: c2-standard-8 - cloud_parameters: - resume_rate: 0 - resume_timeout: 900 - suspend_rate: 0 - suspend_timeout: 600 - no_comma_params: false - tree_width: $(vars.a3_static_cluster_size) - instance_image_custom: true - instance_image: - family: $(vars.final_image_family) - project: $(vars.project_id) - slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl - controller_startup_script: $(controller_startup.startup_script) - enable_external_prolog_epilog: true - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - sysnet - - slurm_controller - settings: - disk_type: pd-balanced - instance_image_custom: true - instance_image: - family: $(vars.final_image_family) - project: $(vars.project_id) - machine_type: c2-standard-4 - startup_script: | - #!/bin/bash - # reset enroot to defaults of files under /home and running under /run - # allows basic enroot testing on login node (reduced I/O) - rm -f /etc/enroot/enroot.conf diff --git a/examples/ml-slurm-v5-legacy.yaml b/examples/ml-slurm-v5-legacy.yaml deleted file mode 100644 index 113c052405..0000000000 --- a/examples/ml-slurm-v5-legacy.yaml +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -blueprint_name: ml-slurm - -vars: - project_id: ## Set project id here - deployment_name: ml-example - region: asia-southeast1 - zone: asia-southeast1-b - zones: - - asia-southeast1-a - - asia-southeast1-b - - asia-southeast1-c - new_image: - family: ml-slurm - project: $(vars.project_id) - disk_size_gb: 200 - metadata: # Workaround for https://github.com/GoogleCloudPlatform/cluster-toolkit/discussions/3243 - VmDnsSetting: GlobalOnly - -# Recommended to use GCS backend for Terraform state -# See https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#optional-setting-up-a-remote-terraform-state -# -# terraform_backend_defaults: -# type: gcs -# configuration: -# bucket: <> - -deployment_groups: -- group: primary - modules: - - id: network - source: modules/network/pre-existing-vpc - - # this example anticipates that the VPC default network has internal traffic - # allowed and IAP tunneling for SSH connections - - id: firewall_rule - source: modules/network/firewall-rules - use: - - network - settings: - ingress_rules: - - name: $(vars.deployment_name)-allow-internal-traffic - description: Allow internal traffic - destination_ranges: - - $(network.subnetwork_address) - source_ranges: - - $(network.subnetwork_address) - allow: - - protocol: tcp - ports: - - 0-65535 - - protocol: udp - ports: - - 0-65535 - - protocol: icmp - - name: $(vars.deployment_name)-allow-iap-ssh - description: Allow IAP-tunneled SSH connections - destination_ranges: - - $(network.subnetwork_address) - source_ranges: - - 35.235.240.0/20 - allow: - - protocol: tcp - ports: - - 22 - - - id: homefs - source: modules/file-system/filestore - use: - - network - settings: - local_mount: /home - size_gb: 2560 - filestore_tier: BASIC_SSD - - - id: script - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: install-ml-libraries.sh - content: | - #!/bin/bash - # this script is designed to execute on Slurm images published by SchedMD that: - # - are based on Debian distribution of Linux - # - have NVIDIA drivers pre-installed - - set -e -o pipefail - - echo "deb https://packages.cloud.google.com/apt google-fast-socket main" > /etc/apt/sources.list.d/google-fast-socket.list - apt-get update --allow-releaseinfo-change - apt-get install --assume-yes google-fast-socket - - CONDA_BASE=/opt/conda - - if [ -d $CONDA_BASE ]; then - exit 0 - fi - - DL_DIR=\$(mktemp -d) - cd $DL_DIR - curl -L -O https://github.com/conda-forge/miniforge/releases/download/24.7.1-2/Miniforge3-24.7.1-2-Linux-x86_64.sh - HOME=$DL_DIR bash Miniforge3-24.7.1-2-Linux-x86_64.sh -b -p $CONDA_BASE - cd - - rm -rf $DL_DIR - unset DL_DIR - - source $CONDA_BASE/bin/activate base - conda init --system - conda config --system --set auto_activate_base False - # following channel ordering is important! use strict_priority! 
- conda config --system --set channel_priority strict - conda update -n base conda --yes - - ### create a virtual environment for tensorflow - conda create -n tf python=3.11 --yes - conda activate tf - pip install tensorflow[and-cuda]==2.18.* - -- group: packer - modules: - - id: custom-image - source: modules/packer/custom-image - kind: packer - use: - - network - - script - settings: - # give VM a public IP to ensure startup script can reach public internet - # w/o new VPC - omit_external_ip: false - source_image_project_id: [schedmd-slurm-public] - # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-5-12-debian-11 - # You can find size of source image by using following command - # gcloud compute images describe-from-family --project schedmd-slurm-public - disk_size: $(vars.disk_size_gb) - disk_type: pd-ssd - image_family: $(vars.new_image.family) - # building this image does not require a GPU-enabled VM - machine_type: c2-standard-4 - state_timeout: 15m - -- group: cluster - modules: - - id: examples - source: modules/scripts/startup-script - settings: - runners: - - type: data - destination: /var/tmp/torch_test.sh - content: | - #!/bin/bash - source /etc/profile.d/conda.sh - conda activate pytorch - python3 torch_test.py - - type: data - destination: /var/tmp/torch_test.py - content: | - import torch - import torch.utils.benchmark as benchmark - - def batched_dot_mul_sum(a, b): - '''Computes batched dot by multiplying and summing''' - return a.mul(b).sum(-1) - - def batched_dot_bmm(a, b): - '''Computes batched dot by reducing to bmm''' - a = a.reshape(-1, 1, a.shape[-1]) - b = b.reshape(-1, b.shape[-1], 1) - return torch.bmm(a, b).flatten(-3) - - # use GPU if available, else CPU - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - print('Using device:', device) - if device.type == 'cuda': - print(torch.cuda.get_device_name(0)) - - # benchmarking - x = torch.randn(10000, 64) - t0 = benchmark.Timer( - stmt='batched_dot_mul_sum(x, x)', - setup='from __main__ import batched_dot_mul_sum', - globals={'x': x}) - t1 = benchmark.Timer( - stmt='batched_dot_bmm(x, x)', - setup='from __main__ import batched_dot_bmm', - globals={'x': x}) - print(t0.timeit(100)) - print(t1.timeit(100)) - - - id: a2_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - bandwidth_tier: gvnic_enabled - machine_type: a2-highgpu-1g - instance_image: $(vars.new_image) - instance_image_custom: true - - - id: a2_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - a2_node_group - - homefs - - network - settings: - partition_name: a2 - is_default: true - - - id: g2_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - bandwidth_tier: gvnic_enabled - machine_type: g2-standard-4 - instance_image: $(vars.new_image) - instance_image_custom: true - - - id: g2_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - g2_node_group - - homefs - - network - settings: - partition_name: g2 - enable_placement: false - exclusive: false - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network - - a2_partition - - g2_partition - - homefs - settings: - disable_controller_public_ips: false - instance_image: $(vars.new_image) - instance_image_custom: true - - - id: 
slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - examples - - network - - slurm_controller - settings: - disable_login_public_ips: false - instance_image: $(vars.new_image) - instance_image_custom: true diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm-v5-legacy.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm-v5-legacy.yaml deleted file mode 100644 index e7e9d5e09e..0000000000 --- a/tools/cloud-build/daily-tests/blueprints/lustre-slurm-v5-legacy.yaml +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: test-slurm-lustre - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: test-slurm-lustre - region: us-central1 - zone: us-central1-a - machine_type: n2-standard-2 - disk_type: pd-ssd - # enable_placement: false - # on_host_maintenance: MIGRATE - num_nodes: 1 - centos_image: - family: slurm-gcp-5-12-hpc-centos-7 - project: schedmd-slurm-public - rocky_image: - family: slurm-gcp-5-12-hpc-rocky-linux-8 - project: schedmd-slurm-public - -deployment_groups: -- group: primary - modules: - - - id: network1 - source: modules/network/pre-existing-vpc - - ########### - # Storage # - ########### - - # This file system has an associated license cost. 
- # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud - - id: lustre - source: community/modules/file-system/DDN-EXAScaler - use: [network1] - settings: - local_mount: /lustre - waiter: deploymentmanager - mgs: - nic_type: "GVNIC" - node_type: n2-standard-2 - node_count: 1 - node_cpu: "Intel Cascade Lake" - public_ip: true - mds: - nic_type: "GVNIC" - node_type: n2-standard-2 - node_count: 1 - node_cpu: "Intel Cascade Lake" - public_ip: true - oss: - nic_type: "GVNIC" - node_type: n2-standard-2 - node_count: 3 - node_cpu: "Intel Cascade Lake" - public_ip: true - - ############# - # Slurm VMs # - ############# - - # # Ubuntu 20.04 LTS - # - id: ubuntu_node_group - # source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - # settings: - # node_count_dynamic_max: $(vars.num_nodes) - # instance_image: - # family: slurm-gcp-5-12-ubuntu-2004-lts - # project: schedmd-slurm-public - - # - id: ubuntu_partition - # source: community/modules/compute/schedmd-slurm-gcp-v5-partition - # use: - # - network1 - # - ubuntu_node_group - # - lustre - # settings: - # partition_name: ubuntu - - # Rocky Linux 8 - - id: rocky_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: $(vars.num_nodes) - instance_image: $(vars.rocky_image) - - - id: rocky_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - rocky_node_group - - lustre - settings: - partition_name: rocky - - # CentOS 7 - - id: centos_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: $(vars.num_nodes) - instance_image: $(vars.centos_image) - - - id: centos_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - centos_node_group - - lustre - settings: - partition_name: centos - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - # - ubuntu_partition - - rocky_partition - - centos_partition - - lustre - settings: - disable_controller_public_ips: false - # cloud_parameters: - # no_comma_params: false - # resume_rate: 0 - # resume_timeout: 1200 - # suspend_rate: 0 - # suspend_timeout: 1200 - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - disable_login_public_ips: false diff --git a/tools/validate_configs/test_configs/gpu-v5-legacy.yaml b/tools/validate_configs/test_configs/gpu-v5-legacy.yaml deleted file mode 100644 index 16f4a9fde8..0000000000 --- a/tools/validate_configs/test_configs/gpu-v5-legacy.yaml +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -blueprint_name: gpu-vm - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: gpu-vm - region: us-central1 - zone: us-central1-c - instance_image_vm: - family: common-dl-gpu-debian-10 - project: ml-images - -# Broken into 3 groups to better manage GPU quotas -deployment_groups: -- group: high-count-auto - modules: - - id: network-hca - source: modules/network/pre-existing-vpc - - - id: auto-megagpu - source: modules/compute/vm-instance - use: - - network-hca - settings: - name_prefix: auto-megagpu - machine_type: a2-megagpu-16g - instance_image: $(vars.instance_image_vm) - -- group: high-count-manual - modules: - - id: network-hcm - source: modules/network/pre-existing-vpc - - - id: manual-megagpu - source: modules/compute/vm-instance - use: - - network-hcm - settings: - name_prefix: manual-megagpu - machine_type: a2-megagpu-16g - instance_image: $(vars.instance_image_vm) - guest_accelerator: - - type: nvidia-tesla-a100 - count: 16 - -- group: low-count - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - - id: network1 - source: modules/network/pre-existing-vpc - - - id: nogpu-n1 - source: modules/compute/vm-instance - use: - - network1 - settings: - name_prefix: nogpu-n1 - machine_type: n1-standard-8 - instance_image: $(vars.instance_image_vm) - - - id: manual-n1 - source: modules/compute/vm-instance - use: - - network1 - settings: - name_prefix: manual-n1 - machine_type: n1-standard-32 - on_host_maintenance: TERMINATE - instance_image: $(vars.instance_image_vm) - guest_accelerator: - - type: nvidia-tesla-t4 - count: 1 - - - id: auto-highgpu - source: modules/compute/vm-instance - use: - - network1 - settings: - name_prefix: auto-highgpu - machine_type: a2-highgpu-1g - instance_image: $(vars.instance_image_vm) - - - id: manual-highgpu - source: modules/compute/vm-instance - use: - - network1 - settings: - name_prefix: manual-highgpu - machine_type: a2-highgpu-2g - instance_image: $(vars.instance_image_vm) - guest_accelerator: - - type: nvidia-tesla-a100 - count: 2 - - - id: auto-ultragpu - source: modules/compute/vm-instance - use: - - network1 - settings: - name_prefix: auto-ultragpu - machine_type: a2-ultragpu-2g - instance_image: $(vars.instance_image_vm) - - - id: manual-ultragpu - source: modules/compute/vm-instance - use: - - network1 - settings: - name_prefix: manual-ultragpu - machine_type: a2-ultragpu-2g - instance_image: $(vars.instance_image_vm) - guest_accelerator: - - type: nvidia-a100-80gb - count: 2 - -- group: slurm-gcp-v5 - modules: - - id: network_slurm - source: modules/network/pre-existing-vpc - - - id: nogpu_nodegroup - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: nogpu - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - - - id: manual_nodegroup - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: man - node_count_dynamic_max: 4 - machine_type: a2-ultragpu-2g - guest_accelerator: - - type: nvidia-a100-80gb - count: 2 - - - id: auto_nodegroup - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: auto - node_count_dynamic_max: 4 - machine_type: a2-ultragpu-2g - - - id: partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network_slurm - - nogpu_nodegroup - - manual_nodegroup - - auto_nodegroup - settings: - partition_name: debug - enable_placement: false - is_default: true - 
- - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network_slurm - - partition - settings: - disable_controller_public_ips: false - machine_type: a2-highgpu-2g - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network_slurm - - slurm_controller - settings: - disable_login_public_ips: false - machine_type: a2-highgpu-1g diff --git a/tools/validate_configs/test_configs/node-groups-v5-legacy.yaml b/tools/validate_configs/test_configs/node-groups-v5-legacy.yaml deleted file mode 100644 index 9dcd1332bc..0000000000 --- a/tools/validate_configs/test_configs/node-groups-v5-legacy.yaml +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: node-group-test-v5 - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-gcp-v5 - region: us-central1 - zone: us-central1-c - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: community/modules/file-system/nfs-server - use: [network1] - settings: - local_mounts: [/home] - auto_delete_disk: true - - ## Single node group, use defaults where appropriate - - id: default_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: simple - machine_type: c2-standard-30 - - - id: one_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - default_node_group - settings: - partition_name: simple - - ## Complex partition using node groups - - id: node_group1 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: c30 - machine_type: c2-standard-30 - instance_image: - family: slurm-gcp-5-12-debian-11 - project: schedmd-slurm-public - instance_image_custom: true - - - id: node_group2 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: c60 - machine_type: c2-standard-60 - instance_image: - name: slurm-gcp-dev-hpc-centos-7-1684970018 - project: schedmd-slurm-public - - - id: node_group3 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: cd112 - machine_type: c2d-standard-112 - instance_image: - family: slurm-gcp-5-12-hpc-centos-7 - project: schedmd-slurm-public - instance_image_custom: true - enable_smt: true - - - id: node_group4 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: cd56 - machine_type: c2d-standard-56 - - - id: multiple_node_groups - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - node_group1 - - node_group2 - - node_group3 - - node_group4 - settings: - partition_name: multng - enable_reconfigure: true - - ## Explicitly set node partition with one node 
group - - id: one_node_group_explicit - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - settings: - partition_name: explng - enable_placement: false - is_default: true - node_groups: - - node_count_static: 0 - node_count_dynamic_max: 4 - group_name: expl - node_conf: {} - additional_disks: [] - additional_networks: [] - bandwidth_tier: null - can_ip_forward: false - disable_smt: false - disk_auto_delete: true - disk_labels: {} - disk_size_gb: 50 - disk_type: pd-standard - enable_confidential_vm: false - enable_oslogin: true - enable_shielded_vm: false - enable_spot_vm: false - gpu: null - instance_template: null - labels: $(vars.labels) - machine_type: n2-standard-16 - maintenance_interval: "" - metadata: {} - min_cpu_platform: null - on_host_maintenance: TERMINATE - preemptible: false - reservation_name: null # will be replaced by default value empty string - service_account: null - shielded_instance_config: null - spot_instance_config: null - source_image_family: null - source_image_project: null - source_image: null - tags: [] - access_config: [] - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - one_node_group - - multiple_node_groups - - one_node_group_explicit - - homefs - settings: - disable_controller_public_ips: false - enable_reconfigure: true - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-4 - disable_login_public_ips: false diff --git a/tools/validate_configs/test_configs/slurm-gcp-v5-startup-scripts-v5-legacy.yaml b/tools/validate_configs/test_configs/slurm-gcp-v5-startup-scripts-v5-legacy.yaml deleted file mode 100644 index f15605b90d..0000000000 --- a/tools/validate_configs/test_configs/slurm-gcp-v5-startup-scripts-v5-legacy.yaml +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-cluster-slurm-v5 - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-small-v5 - region: us-west4 - zone: us-west4-c - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: community/modules/file-system/nfs-server - use: [network1] - settings: - local_mounts: [/home] - auto_delete_disk: true - - - id: bucket - source: community/modules/file-system/cloud-storage-bucket - settings: - name_prefix: input-data - local_mount: /data - random_suffix: true - mount_options: defaults,_netdev,implicit_dirs,allow_other - - # Used by the partitions, this tests startup scripts that are partition specific - - id: startup-partition - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: startup-test-partition.sh - content: | - #!/bin/bash - set -ex - echo "Hello partition! 
Hostname: \$(hostname)" - - - id: debug_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - - - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - bucket - - debug_node_group - settings: - partition_name: debug - enable_placement: false - is_default: true - - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 20 - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - bucket - - compute_node_group - - startup-partition - settings: - partition_name: compute - - # Used by the login and controller, the controller applies it to all partitions as well. - - id: startup-all - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: startup-test-all.sh - content: | - #!/bin/bash - set -ex - echo "Hello world! Hostname: \$(hostname)" - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - debug_partition - - compute_partition - - homefs - - bucket - - startup-all - settings: - disable_controller_public_ips: false - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - - startup-all - settings: - disable_login_public_ips: false diff --git a/tools/validate_configs/test_configs/slurm-static-test-v5-legacy.yaml b/tools/validate_configs/test_configs/slurm-static-test-v5-legacy.yaml deleted file mode 100644 index 46e7eb37f0..0000000000 --- a/tools/validate_configs/test_configs/slurm-static-test-v5-legacy.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -blueprint_name: test-slurm-static-nodes-v5 - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: test-v5 - region: us-central1 - zone: us-central1-a - machine_type: n1-standard-2 - instance_image: - # Please refer to the following link for the latest images: - # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-12-ubuntu-2004-lts - # family: slurm-gcp-5-12-hpc-centos-7 - family: slurm-gcp-5-12-hpc-rocky-linux-8 - # family: slurm-gcp-5-12-debian-11 - project: schedmd-slurm-public - instance_image_custom: true - enable_reconfigure: true - enable_cleanup_compute: true - enable_cleanup_subscriptions: true - # num_dynamic_nodes: 2 - num_static_nodes: 3 - -deployment_groups: -- group: primary - modules: - - ########### - # Network # - ########### - - id: network1 - source: modules/network/pre-existing-vpc - - ############# - # Slurm VMs # - ############# - # - id: dynamic_node_group - # source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - # settings: - # node_count_dynamic_max: $(vars.num_dynamic_nodes) - # machine_type: n2-standard-2 - - # - id: dynamic_partition - # source: community/modules/compute/schedmd-slurm-gcp-v5-partition - # use: - # - network1 - # - dynamic_node_group - # settings: - # partition_name: dynamic - # # exclusive: false # allows nodes to stay up after jobs are done - # enable_placement: false # the default is: true - # # is_default: true - - - id: static_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 0 - node_count_static: $(vars.num_static_nodes) - - - id: static_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - static_node_group - settings: - partition_name: static - enable_placement: false - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - # - dynamic_partition - - static_partition - settings: - disable_controller_public_ips: false - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - disable_login_public_ips: false diff --git a/tools/validate_configs/test_configs/zone-policies-slurm-v5-legacy.yaml b/tools/validate_configs/test_configs/zone-policies-slurm-v5-legacy.yaml deleted file mode 100644 index 0403fb7fdd..0000000000 --- a/tools/validate_configs/test_configs/zone-policies-slurm-v5-legacy.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -blueprint_name: slurm-gcp-v5-hpc-centos7 - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-gcp-v5 - region: us-central1 - zone: us-central1-c - additional_zones: - - us-central1-a - - us-central1-b - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - # Partition which permits a specific zone - - id: zonal_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - disable_public_ips: false - - id: zonal_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - zonal_node_group - settings: - partition_name: zonal - enable_placement: false - - # Partition which allows a total of 3 zones - - id: multizonal_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - - id: multizonal_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - multizonal_node_group - settings: - partition_name: multiz - enable_placement: false - zones: $(vars.additional_zones) - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - homefs - - zonal_partition - - multizonal_partition - settings: - disable_controller_public_ips: false - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-4 - disable_login_public_ips: false From d5f6312c9573ae41978cdd80fbedf6545283cea1 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 6 Jan 2025 13:12:57 -0800 Subject: [PATCH 086/140] Revert "Revert use of toolkit_modules_url in examples" --- community/examples/tutorial-starccm-slurm.yaml | 2 ++ docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index ebf52861ff..9e64014ea7 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -15,6 +15,8 @@ --- blueprint_name: starccm-on-slurm +toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit +toolkit_modules_version: v1.41.0 vars: project_id: ## Set GCP Project ID Here ## diff --git a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml index 45312348ed..813a90f0b6 100644 --- a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml @@ -15,6 +15,8 @@ --- blueprint_name: hpc-cluster-hybrid-v5 +toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit +toolkit_modules_version: v1.41.0 vars: project_id: ## <> From fb53d148227ba85eac7c0e0e60417232c8f31748 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 6 Jan 2025 13:25:51 -0800 Subject: [PATCH 087/140] Update tutorial-starccm-slurm.yaml --- community/examples/tutorial-starccm-slurm.yaml | 17 +++++++++++++++++ 1 file changed, 17 
insertions(+) diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index 9e64014ea7..e3e50b013d 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -24,6 +24,23 @@ vars: region: us-central1 zone: us-central1-c +terraform_providers: + google: + source: hashicorp/google + version: 5.45.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + + google-beta: + source: hashicorp/google-beta + version: 5.45.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + # Documentation for each of the modules used below can be found at # https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md From 7f1583c1874cf421ee6c674280ddf2cae7e0b357 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 6 Jan 2025 13:26:10 -0800 Subject: [PATCH 088/140] Update hybrid-configuration.yaml --- .../blueprints/hybrid-configuration.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml index 813a90f0b6..0f96ec1ac5 100644 --- a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml @@ -28,6 +28,23 @@ vars: network_name: compute-vpc-network subnetwork_name: primary-subnet +terraform_providers: + google: + source: hashicorp/google + version: 5.45.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + + google-beta: + source: hashicorp/google-beta + version: 5.45.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + deployment_groups: # Uncomment the below section if network used for bursting has not been created # - group: create_network From 1b65000d1c2c884e11108c58a69f38d24713d68c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 Jan 2025 22:19:17 +0000 Subject: [PATCH 089/140] Bump github.com/go-git/go-git/v5 from 5.12.0 to 5.13.1 Bumps [github.com/go-git/go-git/v5](https://github.com/go-git/go-git) from 5.12.0 to 5.13.1. - [Release notes](https://github.com/go-git/go-git/releases) - [Commits](https://github.com/go-git/go-git/compare/v5.12.0...v5.13.1) --- updated-dependencies: - dependency-name: github.com/go-git/go-git/v5 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 6 +++--- go.sum | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/go.mod b/go.mod index 6e9502407e..193ad2ba2d 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.22 require ( cloud.google.com/go/storage v1.41.0 // indirect - github.com/go-git/go-git/v5 v5.12.0 + github.com/go-git/go-git/v5 v5.13.1 github.com/hashicorp/go-getter v1.7.6 github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/hcl/v2 v2.23.0 @@ -63,7 +63,7 @@ require ( cloud.google.com/go/compute/metadata v0.3.0 // indirect cloud.google.com/go/iam v1.1.8 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect - github.com/ProtonMail/go-crypto v1.1.0-alpha.2 // indirect + github.com/ProtonMail/go-crypto v1.1.3 // indirect github.com/agext/levenshtein v1.2.3 github.com/aws/aws-sdk-go v1.44.122 // indirect github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d // indirect @@ -90,7 +90,7 @@ require ( github.com/mitchellh/go-wordwrap v1.0.1 // indirect github.com/pjbgf/sha1cd v0.3.0 // indirect github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect - github.com/skeema/knownhosts v1.2.2 // indirect + github.com/skeema/knownhosts v1.3.0 // indirect github.com/spf13/pflag v1.0.5 github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect diff --git a/go.sum b/go.sum index f976fd23a8..506041e49b 100644 --- a/go.sum +++ b/go.sum @@ -195,8 +195,8 @@ github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= -github.com/ProtonMail/go-crypto v1.1.0-alpha.2 h1:bkyFVUP+ROOARdgCiJzNQo2V2kiB97LyUpzH9P6Hrlg= -github.com/ProtonMail/go-crypto v1.1.0-alpha.2/go.mod h1:rA3QumHc/FZ8pAHreoekgiAbzpNsfQAosU5td4SnOrE= +github.com/ProtonMail/go-crypto v1.1.3 h1:nRBOetoydLeUb4nHajyO2bKqMLfWQ/ZPwkXqXxPxCFk= +github.com/ProtonMail/go-crypto v1.1.3/go.mod h1:rA3QumHc/FZ8pAHreoekgiAbzpNsfQAosU5td4SnOrE= github.com/agext/levenshtein v1.2.3 h1:YB2fHEn0UJagG8T1rrWknE3ZQzWM06O8AMAatNn7lmo= github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= @@ -237,8 +237,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/elazarl/goproxy v0.0.0-20230808193330-2592e75ae04a h1:mATvB/9r/3gvcejNsXKSkQ6lcIaNec2nyfOdlTBR2lU= -github.com/elazarl/goproxy v0.0.0-20230808193330-2592e75ae04a/go.mod h1:Ro8st/ElPeALwNFlcTpWmkr6IoMFfkjXAvTHpevnDsM= +github.com/elazarl/goproxy v1.2.3 h1:xwIyKHbaP5yfT6O9KIeYJR5549MXRQkoQMRXGztz8YQ= +github.com/elazarl/goproxy v1.2.3/go.mod h1:YfEbZtqP4AetfO6d40vWchF3znWX7C7Vd6ZMfdL8z64= github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= github.com/emirpasic/gods 
v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= @@ -257,16 +257,16 @@ github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/ github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= -github.com/gliderlabs/ssh v0.3.7 h1:iV3Bqi942d9huXnzEF2Mt+CY9gLu8DNM4Obd+8bODRE= -github.com/gliderlabs/ssh v0.3.7/go.mod h1:zpHEXBstFnQYtGnB8k8kQLol82umzn/2/snG7alWVD8= +github.com/gliderlabs/ssh v0.3.8 h1:a4YXD1V7xMF9g5nTkdfnja3Sxy1PVDCj1Zg4Wb8vY6c= +github.com/gliderlabs/ssh v0.3.8/go.mod h1:xYoytBv1sV0aL3CavoDuJIQNURXkkfPA/wxQ1pL1fAU= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 h1:+zs/tPmkDkHx3U66DAb0lQFJrpS6731Oaa12ikc+DiI= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376/go.mod h1:an3vInlBmSxCcxctByoQdvwPiA7DTK7jaaFDBTtu0ic= github.com/go-git/go-billy/v5 v5.6.1 h1:u+dcrgaguSSkbjzHwelEjc0Yj300NUevrrPphk/SoRA= github.com/go-git/go-billy/v5 v5.6.1/go.mod h1:0AsLr1z2+Uksi4NlElmMblP5rPcDZNRCD8ujZCRR2BE= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod h1:1OCfN199q1Jm3HZlxleg+Dw/mwps2Wbk9frAWm+4FII= -github.com/go-git/go-git/v5 v5.12.0 h1:7Md+ndsjrzZxbddRDZjF14qK+NN56sy6wkqaVrjZtys= -github.com/go-git/go-git/v5 v5.12.0/go.mod h1:FTM9VKtnI2m65hNI/TenDDDnUf2Q9FHnXYjuz9i5OEY= +github.com/go-git/go-git/v5 v5.13.1 h1:DAQ9APonnlvSWpvolXWIuV6Q6zXy2wHbN4cVlNR5Q+M= +github.com/go-git/go-git/v5 v5.13.1/go.mod h1:qryJB4cSBoq3FRoBRf5A77joojuBcmPJ0qu3XXXVixc= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= @@ -464,8 +464,8 @@ github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= -github.com/skeema/knownhosts v1.2.2 h1:Iug2P4fLmDw9f41PB6thxUkNUkJzB5i+1/exaj40L3A= -github.com/skeema/knownhosts v1.2.2/go.mod h1:xYbVRSPxqBZFrdmDyMmsOs+uX1UZC3nTN3ThzgDxUwo= +github.com/skeema/knownhosts v1.3.0 h1:AM+y0rI04VksttfwjkSTNQorvGqmwATnvnAHpSgc0LY= +github.com/skeema/knownhosts v1.3.0/go.mod h1:sPINvnADmT/qYH1kfv+ePMmOBTH6Tbl7b5LvTDjFK7M= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= From 63dd3a38e987586d54267c258ce3834aed898717 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 Jan 2025 
22:19:37 +0000 Subject: [PATCH 090/140] Bump golang.org/x/sys from 0.28.0 to 0.29.0 Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.28.0 to 0.29.0. - [Commits](https://github.com/golang/sys/compare/v0.28.0...v0.29.0) --- updated-dependencies: - dependency-name: golang.org/x/sys dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 6e9502407e..f99e4240fa 100644 --- a/go.mod +++ b/go.mod @@ -98,7 +98,7 @@ require ( golang.org/x/crypto v0.31.0 // indirect golang.org/x/net v0.33.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.28.0 + golang.org/x/sys v0.29.0 golang.org/x/text v0.21.0 // indirect google.golang.org/grpc v1.64.1 // indirect google.golang.org/protobuf v1.34.2 // indirect diff --git a/go.sum b/go.sum index f976fd23a8..92d2312558 100644 --- a/go.sum +++ b/go.sum @@ -732,8 +732,8 @@ golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= -golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= +golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= From f3ffd14b42b6bd5a666eae7509d25d8a14a8248b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 00:14:29 +0000 Subject: [PATCH 091/140] Bump jinja2 from 3.1.4 to 3.1.5 in /community/front-end/ofe Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.4 to 3.1.5. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/3.1.4...3.1.5) --- updated-dependencies: - dependency-name: jinja2 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 22ed40cfac..efaabed2a5 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -46,7 +46,7 @@ identify==2.5.24 idna==3.7 importlib-resources==6.1.1 isort==5.12.0 -Jinja2==3.1.4 +Jinja2==3.1.5 jsonschema==4.20.0 jsonschema-specifications==2023.11.1 lazy-object-proxy==1.9.0 From c290e52cf4dfbc9bf3e5c99684a24edf73f9cbf4 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Mon, 16 Dec 2024 21:16:41 +0000 Subject: [PATCH 092/140] Adding max_distance variable --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 1 + .../schedmd-slurm-gcp-v6-nodeset/main.tf | 1 + .../schedmd-slurm-gcp-v6-nodeset/outputs.tf | 10 +++++++ .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 12 ++++++++ .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/scripts/resume.py | 30 +++++++++++++++---- .../slurm_files/scripts/tests/common.py | 1 + .../slurm_files/scripts/tests/test_resume.py | 2 ++ .../partition.tf | 1 + .../variables.tf | 1 + 10 files changed, 55 insertions(+), 6 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 297c40bb7a..f79a9307b5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -196,6 +196,7 @@ No modules. | [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of auto-scaling nodes allowed in this partition. | `number` | `10` | no | | [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy.

Note: Placement groups are not supported when on\_host\_maintenance is set to
"MIGRATE" and will be deactivated regardless of the value of
enable\_placement. To support enable\_placement, ensure on\_host\_maintenance is
set to "TERMINATE". | `string` | `"TERMINATE"` | no | +| [placement\_max\_distance](#input\_placement\_max\_distance) | Maximum distance between nodes in the placement group. Requires enable\_placement to be true. Values must be supported by the chosen machine type. | `number` | `null` | no | | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 84cb60457a..eca10e9d1a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -71,6 +71,7 @@ locals { enable_confidential_vm = var.enable_confidential_vm enable_placement = var.enable_placement + placement_max_distance = var.placement_max_distance enable_oslogin = var.enable_oslogin enable_shielded_vm = var.enable_shielded_vm gpu = one(local.guest_accelerator) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf index 5781d2415c..d618644d52 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf @@ -45,6 +45,16 @@ output "nodeset" { error_message = "Cannot use placement with static and auto-scaling nodes in the same node set." } + precondition { + condition = var.placement_max_distance == null || var.enable_placement + error_message = "placement_max_distance requires enable_placement to be set to true." + } + + precondition { + condition = !(startswith(var.machine_type, "a3-") && var.placement_max_distance == 1) + error_message = "A3 machines do not support a placement_max_distance of 1." + } + precondition { condition = var.reservation_name == "" || !var.dws_flex.enabled error_message = "Cannot use reservations with DWS Flex." diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 3b7e342c32..82adca0b1b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -565,3 +565,15 @@ variable "dws_flex" { error_message = "Max duration must be more than 30 seconds, and cannot be more than two weeks." } } + +variable "placement_max_distance" { + type = number + description = "Maximum distance between nodes in the placement group. Requires enable_placement to be true. Values must be supported by the chosen machine type." + nullable = true + default = null + + validation { + condition = coalesce(var.placement_max_distance, 1) >= 1 && coalesce(var.placement_max_distance, 3) <= 3 + error_message = "Invalid value for placement_max_distance. Valid values are null, 1, 2, or 3." + } +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index b03fbf0973..d485cb1765 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -336,7 +336,7 @@ limitations under the License. 
| [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, false)
enable_opportunistic_maintenance = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
future_reservation = string
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
placement_max_distance = optional(number, null)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, false)
enable_opportunistic_maintenance = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
future_reservation = string
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index fa5413e53c..de09c9fa12 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -444,12 +444,13 @@ def hold_job(job_id, reason): run(f"{lookup().scontrol} update jobid={job_id} comment='{reason}'") -def create_placement_request(pg_name, region): +def create_placement_request(pg_name: str, region: str, max_distance: Optional[int]): config = { "name": pg_name, "region": region, "groupPlacementPolicy": { "collocation": "COLLOCATED", + "maxDistance": max_distance }, } if lookup().cfg.enable_slurm_gcp_plugins: @@ -489,11 +490,13 @@ def _allocate_nodes_to_placements(nodes: List[str], excl_job_id:Optional[int], l if not (nodeset.enable_placement and valid_placement_node(model)): return no_pp + max_count = calculate_chunk_size(nodeset, lkp) + name_prefix = f"{lkp.cfg.slurm_cluster_name}-slurmgcp-managed-{nodeset.nodeset_name}" if excl_job_id: # simply chunk given nodes by max size of placement return [ PlacementAndNodes(placement=f"{name_prefix}-{excl_job_id}-{i}", nodes=chunk) - for i, chunk in enumerate(chunked(nodes, n=PLACEMENT_MAX_CNT)) + for i, chunk in enumerate(chunked(nodes, n=max_count)) ] # split whole nodeset (not only nodes to resume) into chunks of max size of placement @@ -503,7 +506,7 @@ def _allocate_nodes_to_placements(nodes: List[str], excl_job_id:Optional[int], l for node in nodes: try: - chunk = lkp.node_index(node) // PLACEMENT_MAX_CNT + chunk = lkp.node_index(node) // max_count chunks[chunk].append(node) except: invalid.append(node) @@ -520,18 +523,35 @@ def _allocate_nodes_to_placements(nodes: List[str], excl_job_id:Optional[int], l return placements +def calculate_chunk_size(nodeset: NSDict, lkp: util.Lookup) -> int: + # Calculates the chunk size based on max distance value received + machine_type = lkp.template_info(nodeset.instance_template).machine_type.family + max_distance = nodeset.placement_max_distance + if max_distance == 1: + return 22 + elif max_distance == 2: + if machine_type.startswith("a3"): + return 256 + else: + return 150 + elif max_distance == 3: + return 1500 + else: + return PLACEMENT_MAX_CNT + def create_nodeset_placements(nodes: List[str], excl_job_id:Optional[int], lkp: util.Lookup) -> List[PlacementAndNodes]: placements = _allocate_nodes_to_placements(nodes, excl_job_id, lkp) region = lkp.node_region(nodes[0]) + max_distance = lkp.node_nodeset(nodes[0]).get('placement_max_distance') if log.isEnabledFor(logging.DEBUG): debug_p = {p.placement: to_hostlist(p.nodes) for p in placements} log.debug( f"creating {len(placements)} placement groups: \n{yaml.safe_dump(debug_p).rstrip()}" ) - + requests = { - p.placement: create_placement_request(p.placement, region) for p in placements if p.placement + p.placement: create_placement_request(p.placement, region, max_distance) for p in placements if p.placement } if not requests: return placements diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py index 643712efa7..f8434168de 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py @@ -38,6 +38,7 @@ class TstNodeset: reservation_name: Optional[str] = "" zone_policy_allow: Optional[list[str]] = field(default_factory=list) enable_placement: bool = True + placement_max_distance: Optional[int] = None @dataclass class TstPartition: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_resume.py index 3c637bbe10..77f1229605 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_resume.py @@ -170,4 +170,6 @@ def test_allocate_nodes_to_placements(nodes: list[str], excl_job_id: Optional[in with unittest.mock.patch("resume.valid_placement_node") as mock_valid_placement_node: mock_valid_placement_node.return_value = True + lkp.template_info = unittest.mock.Mock(return_value=unittest.mock.Mock(machine_type=unittest.mock.Mock(family="n1"))) + assert resume._allocate_nodes_to_placements(nodes, excl_job_id, lkp) == expected diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 308b60d19d..f7e1c8b526 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -93,6 +93,7 @@ locals { maintenance_interval = ns.maintenance_interval instance_properties_json = ns.instance_properties_json enable_placement = ns.enable_placement + placement_max_distance = ns.placement_max_distance network_storage = ns.network_storage zone_target_shape = ns.zone_target_shape zone_policy_allow = ns.zone_policy_allow diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 6264576b2c..8daa202afd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -205,6 +205,7 @@ variable "nodeset" { disk_type = optional(string) enable_confidential_vm = optional(bool, false) enable_placement = optional(bool, false) + placement_max_distance = optional(number, null) enable_oslogin = optional(bool, true) enable_shielded_vm = optional(bool, false) enable_maintenance_reservation = optional(bool, false) From 0b217d0efcc9d8f09ec27f402c2ce500fd7061e9 Mon Sep 17 00:00:00 2001 From: Parul Bajaj Date: Tue, 7 Jan 2025 05:11:24 +0000 Subject: [PATCH 093/140] Update A3U blueprint to remove MTU var --- examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml index 3037132c21..20699334df 100644 --- a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml @@ -28,7 +28,6 @@ vars: nccl_installer_path: 
$(ghpc_stage("./nccl-installer.yaml")) # Temporary fix for COS issue, will be fixed in next release mglru_disable_path: $(ghpc_stage("./mglru-disable.yaml")) - mtu_size: 8896 static_node_count: # add this system_node_pool_disk_size_gb: 200 a3ultra_node_pool_disk_size_gb: 100 @@ -81,7 +80,7 @@ deployment_groups: source: modules/network/vpc settings: network_name: $(vars.deployment_name)-net-1 - mtu: $(vars.mtu_size) + mtu: 8896 subnetworks: - subnet_name: $(vars.deployment_name)-sub-1 subnet_region: $(vars.region) @@ -100,7 +99,7 @@ deployment_groups: source: modules/network/gpu-rdma-vpc settings: network_name: $(vars.deployment_name)-rdma-net - mtu: $(vars.mtu_size) + mtu: 8896 network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce network_routing_mode: REGIONAL subnetworks_template: From d357df767f5335b0c43a540abae671f227f98657 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Tue, 7 Jan 2025 09:13:12 +0000 Subject: [PATCH 094/140] Remove max_hops plugin --- .../schedmd-slurm-gcp-v6-controller/README.md | 20 +++--- .../slurm_gcp_plugins/max_hops/README.md | 38 ---------- .../slurm_gcp_plugins/max_hops/__init__.py | 72 ------------------- .../modules/slurm_files/variables.tf | 4 ++ .../variables.tf | 4 ++ 5 files changed, 17 insertions(+), 121 deletions(-) delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/README.md delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 99078dbcce..74b3534c42 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -151,23 +151,21 @@ flag which can used to control the maximum spreading allowed. Read more about [official docs](https://cloud.google.com/compute/docs/instances/use-compact-placement-policies ). -You can use the `enable_slurm_gcp_plugins.max_hops.max_hops` setting on the -controller module to control the `max-distance` behavior. See the following -example: +You can use the `placement_max_distance` setting on the nodeset module to control the `max-distance` behavior. See the following example: ```yaml - - id: controller - source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller - use: [ network, partition ] + - id: nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [ network ] settings: - enable_slurm_gcp_plugins: - max_hops: - max_hops: 1 -``` + machine_type: c2-standard-4 + node_count_dynamic_max: 30 + enable_placement: true + placement_max_distance: 1 > [!NOTE] > `schedmd-slurm-gcp-v6-nodeset.settings.enable_placement: true` must also be -> set for max-distance to take effect. +> set for placement_max_distance to take effect. In the above case using a value of 1 will restrict VM to be placed on the same rack. 
You can confirm that the `max-distance` was applied by calling the diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/README.md deleted file mode 100644 index 9e8ad4afeb..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# max_hops slurm_gcp_plugin plugin - -## Overview - -This plugin allows placement parameters to be set controlling the max number of -network hops between nodes in dynamic jobs. - -## Usage - -### Configuration - -This plugin can be enabled by adding the following to the slurm-gcp config: - -```yaml -enable_slurm_gcp_plugins: - #possibly other plugins - max_hops: - max_hops: 1 -``` - -to set the default max_hops to, in this example, 1 for _all_ jobs. - -### Per job setting - -The max hops setting can be changed on a per job basis using the --prefer -argument e.g. as follows: - -salloc --prefer=max_hops.max_hops=1 - -to allow at most one network hop. For this to work the -`ignore_prefer_validation` needs to be added to the slurm `SchedulerParameters` -configuration item. - -## Callbacks used - -### pre_placement_group_insert - -Used to change the placement group creation request. diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py deleted file mode 100644 index 6e1f8dfae7..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2024 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -import sys -import slurm_gcp_plugins.utils as sgp_utils - -# Allows setting a specific max_hop for jobs -# -# To enable: -# * add this directory to the slurm-gcp plugin path (usually /slurm/scripts/slurm-gcp-plugins) -# * add the following to the slurm-gcp config (usually /slurm/scripts/config.yaml): -# -# enable_slurm_gcp_plugins: -# -# max_hops: -# max_hops: -# -# -# Where can be either of 1,2,3 (in increasing order of distance) -# If no max_hops is provided but the plugins is still enabled the default level is 3 - - -def pre_placement_group_insert(*pos_args, **keyword_args): - logging.info("Trying to enable max hop") - # Avoid circular import (util imports the plugins) - if "util" in sys.modules: - logging.info("Setting compute service version to beta") - sys.modules["util"].compute = sys.modules["util"].compute_service( - version="beta" - ) - max_distance = sgp_utils.get_plugin_setting( - plugin="max_hops", - setting="max_hops", - job=get_job_from_placement_group_name(keyword_args["pg_name"]), - lkp=keyword_args["lkp"], - default=3, - ) - logging.debug(f"Setting max hop for placement policy to {max_distance}") - keyword_args["request_body"]["groupPlacementPolicy"][ - "collocation=" - ] = "COLLOCATED" - keyword_args["request_body"]["groupPlacementPolicy"][ - "maxDistance" - ] = max_distance - else: - logging.error( - "max_hops can not be set (slurm_gcp util.py must be imported by the caller of the plugin callback)" - ) - - -__all__ = [ - "pre_placement_group_insert", -] - - -# This should be replaced if the job id becomes available in the context of this plugin hook -def get_job_from_placement_group_name(pg_name): - # f"{cfg.slurm_cluster_name}-{partition_name}-{job_id}-{i}" - - return pg_name.split("-")[2] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 308a42e639..653e7d74ca 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -64,6 +64,10 @@ Enables calling hooks in scripts/slurm_gcp_plugins during cluster resume and sus EOD type = any default = false + validation { + condition = !can(var.enable_slurm_gcp_plugins.max_hops) + error_message = "The 'max_hops' plugin is no longer supported. Please use the 'placement_max_distance' nodeset property instead." + } } variable "enable_bigquery_load" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 6264576b2c..8d47995d31 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -620,6 +620,10 @@ Enables calling hooks in scripts/slurm_gcp_plugins during cluster resume and sus EOD type = any default = false + validation { + condition = !can(var.enable_slurm_gcp_plugins.max_hops) + error_message = "The 'max_hops' plugin is no longer supported. Please use the 'placement_max_distance' nodeset property instead." 
+ } } variable "universe_domain" { From 6cfaeed31d2d94d7f035fb1d38e579bd1c4dccf9 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 7 Jan 2025 10:58:50 +0000 Subject: [PATCH 095/140] Freeze gcluster version for blueprints which were not migrated to slurm-gcp v6 --- .../blueprints/static-cluster.yaml | 19 +++++++++++++++++++ .../test_configs/two-clusters-sql.yaml | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml b/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml index 5abb581eaf..ce0b68dc1b 100644 --- a/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml @@ -15,6 +15,8 @@ --- blueprint_name: static-slurm-cluster +toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit +toolkit_modules_version: v1.41.0 vars: project_id: ## <> @@ -22,6 +24,23 @@ vars: region: us-central1 zone: us-central1-c +terraform_providers: + google: + source: hashicorp/google + version: 5.45.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + + google-beta: + source: hashicorp/google-beta + version: 5.45.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + deployment_groups: - group: primary modules: diff --git a/tools/validate_configs/test_configs/two-clusters-sql.yaml b/tools/validate_configs/test_configs/two-clusters-sql.yaml index 56c46200d3..ab6f71c302 100644 --- a/tools/validate_configs/test_configs/two-clusters-sql.yaml +++ b/tools/validate_configs/test_configs/two-clusters-sql.yaml @@ -13,6 +13,8 @@ # limitations under the License. blueprint_name: two-clusters +toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit +toolkit_modules_version: v1.41.0 vars: project_id: ## Set GCP Project ID Here ## @@ -25,6 +27,23 @@ vars: enable_bigquery_load: True instance_image_custom: True +terraform_providers: + google: + source: hashicorp/google + version: 5.45.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + + google-beta: + source: hashicorp/google-beta + version: 5.45.0 + configuration: + project: $(vars.project_id) + region: $(vars.region) + zone: $(vars.zone) + deployment_groups: - group: net modules: From 682e29737f1cd2b7c9f6e35acf159e2e487c2a7e Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 7 Jan 2025 08:37:39 -0800 Subject: [PATCH 096/140] Update nccl-installer.yaml --- examples/gke-a3-ultragpu/nccl-installer.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/gke-a3-ultragpu/nccl-installer.yaml b/examples/gke-a3-ultragpu/nccl-installer.yaml index 7f28f673df..0227658184 100644 --- a/examples/gke-a3-ultragpu/nccl-installer.yaml +++ b/examples/gke-a3-ultragpu/nccl-installer.yaml @@ -73,6 +73,7 @@ spec: /scripts/container_entry.sh install --install-nccl cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64 cp -r /var/lib/gib/. 
/usr/local/home/kubernetes/bin/gib + ibv_devinfo || exit 1 echo "installation finishes" containers: - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830" From cd11d0291c0087e3b294504b2103f147a09ba77d Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 7 Jan 2025 08:38:11 -0800 Subject: [PATCH 097/140] Delete examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/README.md --- .../a3u-slurm-ubuntu-gcs/README.md | 153 ------------------ 1 file changed, 153 deletions(-) delete mode 100644 examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/README.md diff --git a/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/README.md b/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/README.md deleted file mode 100644 index 7f0c062080..0000000000 --- a/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/README.md +++ /dev/null @@ -1,153 +0,0 @@ -# A3-Ultra Slurm + Ubuntu + GCS - -This reference design creates a Slurm cluster with the following design: - -1. Ubuntu 22 Operating System -1. A static a3-ultragpu-8g partition that uses a reservation. -1. 3 VPCs (2x CPU, 1x for GPU RDMA networks), with a total of 9 subnetworks -1. A GCS bucket that is configured with Hierarchical Namespace enabled -1. Cloud Storage Fuse, configured to utilize Local-SSD storage - -## Deployment Instructions - -### Build the Cluster Toolkit gcluster binary - -Follow instructions -[here](https://cloud.google.com/cluster-toolkit/docs/setup/configure-environment) - -### (Optional, but recommended) Create a GCS Bucket for storing terraform state - -```bash -#!/bin/bash - -TF_STATE_BUCKET_NAME= -PROJECT_ID= -REGION= - -gcloud storage buckets create gs://${TF_STATE_BUCKET_NAME} \ - --project=${PROJECT_ID} \ - --default-storage-class=STANDARD --location=${REGION} \ - --uniform-bucket-level-access -gcloud storage buckets update gs://${TF_STATE_BUCKET_NAME} --versioning -``` - -### Create and configure a GCS Bucket - -This will be used for input data and checkpoint/restart data. This bucket should -be created with Hierarchical Namespace enabled. See -[here](https://cloud.google.com/storage/docs/hns-overview) for more details. - -```bash -#!/bin/bash -PROJECT_ID= -REGION= -HNS_BUCKET_NAME= -PROJECT_NUMER= - -gcloud storage buckets create gs://${HNS_BUCKET_NAME} \ - --location=${REGION} --uniform-bucket-level-access - --enable-hierarchical-namespace - -``` - -### Create/modify the deployment.yaml file with your preferred configuration - -For example, set the such as size, reservation to be used, etc, as well as the -name of the bucket that you just created. Below is an example - -```yaml ---- -terraform_backend_defaults: - type: gcs - configuration: - bucket: TF_STATE_BUCKET_NAME - -vars: - deployment_name: a3u-gcs - project_id: - region: - zone: - a3u_reservation_name: - a3u_cluster_size: - hns_gcs_bucket: # This bucket must have been previously created - -``` - -### Deploy the cluster - -```bash -#!/bin/bash -gcluster deploy -d deployment.yaml a3u-slurm-ubuntu-gcs.yaml -``` - -## Storage Design Components - -On the login and controller nodes, the gcs bucket is mounted at /gcs, using -fairly standard [Cloud Storage Fuse configuration](https://cloud.google.com/storage/docs/cloud-storage-fuse/config-file). On the compute nodes, there are two -mounts of the same bucket. 
First, `/gcs` is mounted with with the following -configuration: - -```yaml -file-cache: - max-size-mb: -1 - enable-parallel-downloads: true - download-chunk-size-mb: 50 - parallel-downloads-per-file: 16 -cache-dir: /mnt/localssd -file-system: - dir-mode: "777" - file-mode: "777" - rename-dir-limit: 20000 # Set to 20000 for hierarchical buckets - temp-dir: /mnt/localssd - fuse-options: allow_other -foreground: true -``` - -This uses /mnt/localssd as a cache dir (for reads) and temp-dir (for writes). -It also enables parallel downloads, which is particularly useful for -checkpoint restarts. - -Next, `/gcs-ro` is mounted in a "read-only" mode, and optimized to for -input (training) data reading. - -```yaml -file-cache: - max-size-mb: -1 -metadata-cache: - ttl-secs: 3600 # Decrease if your data changes quickly. -cache-dir: /mnt/localssd -file-system: - dir-mode: "755" # need 5 on dir to enable ls - file-mode: "644" - temp-dir: /mnt/localssd - fuse-options: allow_other - kernel-list-cache-ttl-secs: 60 -foreground: true -``` - -The local ssds will be used for a file cache, and the metadata-cache -for the data is set to 1 hour, with kernel-list-cache ttl set to 60 seconds. -This reduces the amount of requests that will be sent to GCS, and improves -data loading performance. - -We suggest using /gcs for checkpoint saving/loading. and use /gcs-ro for -data input loading. - -## Running Benchmarks with Ramble - -To run a series of NCCL test benchmarks on your cluster, you can use -the use the following script: `run-nccl-tests-via-ramble.sh`, -which will use [ramble](https://github.com/GoogleCloudPlatform/ramble) to -automate the building and running of nccl tests from 2 nodes up to 32 node -scales. - -Copy the contents of `run-nccl-tests-via-ramble.sh` to your slurm -login or controller node, for example: - -```bash -#!/bin/bash -wget -np -nd https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/develop/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/run-nccl-tests-via-ramble.sh -``` - -and then launch with `bash run-nccl-tests-via-ramble.sh`. The entire process -will take ~30 minutes. From b619d76f2b4f37f902f3ba2582c3ac70cb4ef6d0 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 7 Jan 2025 08:38:41 -0800 Subject: [PATCH 098/140] Delete examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/a3u-slurm-ubuntu-gcs.yaml --- .../a3u-slurm-ubuntu-gcs.yaml | 615 ------------------ 1 file changed, 615 deletions(-) delete mode 100644 examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/a3u-slurm-ubuntu-gcs.yaml diff --git a/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/a3u-slurm-ubuntu-gcs.yaml b/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/a3u-slurm-ubuntu-gcs.yaml deleted file mode 100644 index 7be9f89a00..0000000000 --- a/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/a3u-slurm-ubuntu-gcs.yaml +++ /dev/null @@ -1,615 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -blueprint_name: a3u-slurm-ubuntu-gcs - -vars: - # The following are supplied through the deployment.yaml file. - deployment_name: # supply deployment name - project_id: # supply project ID - region: # supply region - zone: # supply zone - a3u_cluster_size: # supply cluster size - a3u_reservation_name: # supply reservation name - hns_gcs_bucket: # Name of HNS enabled GCS bucket - # End of variables defined by deployment.yaml. The remainder - # of this blueprint need not be modified. - - # Image settings - base_image: - project: ubuntu-os-accelerator-images - family: ubuntu-accelerator-2204-amd64-with-nvidia-550 - image_build_machine_type: n2-standard-16 - build_slurm_from_git_ref: 6.8.6 - - # Cluster env settings - # net0 and filestore ranges must not overlap - net0_range: 192.168.0.0/19 - filestore_ip_range: 192.168.32.0/24 - net1_range: 192.168.64.0/18 - rdma_net_range: 192.168.128.0/18 - - # Cluster Settings - local_ssd_mountpoint: /mnt/localssd - instance_image: - project: $(vars.project_id) - family: $(vars.deployment_name)-u22 - disk_size_gb: 200 - nccl_plugin_version: v1.0.2 - - # Here we define a set of startup script runners that are used to configure - # the controller node - controller_runners: - - type: shell - destination: stage_scripts.sh - content: | - #!/bin/bash - SLURM_ROOT=/opt/apps/adm/slurm - PARTITION_NAME=a3ultra - mkdir -m 0755 -p "${SLURM_ROOT}/scripts" - mkdir -p "${SLURM_ROOT}/partition-${PARTITION_NAME}-epilog_slurmd.d" - ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-${PARTITION_NAME}-epilog_slurmd.d/gpu-test.epilog_slurmd" - - # Shared runners between login and controller: - # Configure an enroot config path - shared_runners: - - type: data - destination: /etc/enroot/enroot.conf - content: | - ENROOT_CONFIG_PATH ${HOME}/.enroot - - # Here we define a set of startup script runners that are used to configure - # the A3-Ultra nodes - # Set up enroot, using the local ssds for runtime/cache/data/temp storage. 
- a3u_runners: - - type: data - destination: /etc/enroot/enroot.conf - content: | - ENROOT_CONFIG_PATH ${HOME}/.enroot - ENROOT_RUNTIME_PATH $(vars.local_ssd_mountpoint)/${UID}/enroot/runtime - ENROOT_CACHE_PATH $(vars.local_ssd_mountpoint)/${UID}/enroot/cache - ENROOT_DATA_PATH $(vars.local_ssd_mountpoint)/${UID}/enroot/data - ENROOT_TEMP_PATH $(vars.local_ssd_mountpoint)/${UID}/enroot - - # Install NCCL Network Plugin - - type: ansible-local - destination: nccl_plugin.yml - content: | - --- - - name: Install NCCL plugin for A3 Ultra series - hosts: all - become: true - tasks: - - name: Add SystemD unit for NCCL plugin installation - ansible.builtin.copy: - dest: /etc/systemd/system/nccl-plugin@.service - mode: 0o0644 - content: | - [Unit] - After=network-online.target - Before=slurmd.service - - [Service] - Type=oneshot - ExecStartPre=/usr/bin/rm -rf /usr/local/gib - ExecStartPre=/usr/bin/mkdir -p /usr/local/gib - ExecStartPre=/snap/bin/gcloud auth configure-docker --quiet us-docker.pkg.dev - ExecStart=/usr/bin/docker run --rm --name nccl-gib-installer --volume /usr/local/gib:/var/lib/gib \ - us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:%i install --install-nccl - - [Install] - WantedBy=slurmd.service - notify: - - Reload SystemD - handlers: - - name: Reload SystemD - ansible.builtin.systemd: - daemon_reload: true - post_tasks: - - name: Enable NCCL plugin SystemD unit - ansible.builtin.service: - name: nccl-plugin@$(vars.nccl_plugin_version).service - state: started - enabled: true - - # Configure Cloud Storage FUSE - - type: ansible-local - destination: gcsfuse.yml - content: | - --- - - name: Create LSSD optimized gcsfuse mount - hosts: all - become: true - tasks: - - name: Create gcsfuse rwx configuration - ansible.builtin.copy: - dest: /etc/gcsfuse-lssd.yml - owner: root - group: root - mode: 0o644 - content: | - file-cache: - max-size-mb: -1 - enable-parallel-downloads: true - download-chunk-size-mb: 50 - parallel-downloads-per-file: 16 - cache-dir: /mnt/localssd - file-system: - dir-mode: "777" - file-mode: "777" - rename-dir-limit: 20000 # Set to 20000 for hierarchical buckets - temp-dir: /mnt/localssd - fuse-options: allow_other - foreground: true - - - name: Create gcsfuse read-only configuration for input data - ansible.builtin.copy: - dest: /etc/gcsfuse-ro.yml - owner: root - group: root - mode: 0o644 - content: | - file-cache: - max-size-mb: -1 - metadata-cache: - ttl-secs: 3600 # Decrease if your data changes quickly. 
- cache-dir: /mnt/localssd - file-system: - dir-mode: "755" # need 5 on dir to enable ls - file-mode: "644" - temp-dir: /mnt/localssd - fuse-options: allow_other - kernel-list-cache-ttl-secs: 60 - foreground: true - - - name: Create gcsfuse systemd service - ansible.builtin.copy: - dest: /etc/systemd/system/gcsfuse-lssd.service - owner: root - group: root - mode: 0o644 - content: | - [Unit] - Description=gcsfuse mount of all buckets - After=local-fs.target - - [Service] - Type=simple - User=root - ExecStartPre=/bin/mkdir -p /gcs - ExecStart=gcsfuse --config-file /etc/gcsfuse-lssd.yml $(vars.hns_gcs_bucket) /gcs - ExecStop=fusermount3 -u /gcs - - [Install] - WantedBy=slurmd.service multi-user.target - - - name: Create read-only gcsfuse systemd service - ansible.builtin.copy: - dest: /etc/systemd/system/gcsfuse-ro.service - owner: root - group: root - mode: 0o644 - content: | - [Unit] - Description=gcsfuse-ro mount - After=local-fs.target - - [Service] - Type=simple - User=root - ExecStartPre=/bin/mkdir -p /gcs-ro - ExecStart=gcsfuse --config-file /etc/gcsfuse-ro.yml $(vars.hns_gcs_bucket) /gcs-ro - ExecStop=fusermount3 -u /gcs-ro - - [Install] - WantedBy=slurmd.service multi-user.target - - post_tasks: - - name: Enable and restart gcsfuse - ansible.builtin.service: - name: gcsfuse-lssd.service - state: restarted - enabled: true - - - name: Enable and restart gcsfuse-ro - ansible.builtin.service: - name: gcsfuse-ro.service - state: restarted - enabled: true - - # Configure Cloud Storage FUSE for login/controller nodes - gcsfuse_runners: - - type: ansible-local - destination: gcsfuse.yml - content: | - --- - - name: Create Standard RWX gcsfuse mount - hosts: localhost - become: true - tasks: - - name: Create gcsfuse configuration - ansible.builtin.copy: - dest: /etc/gcsfuse.yml - owner: root - group: root - mode: 0o644 - content: | - file-system: - dir-mode: "777" - file-mode: "777" - rename-dir-limit: 20000 - fuse-options: allow_other - foreground: true - - - name: Create gcsfuse systemd service - ansible.builtin.copy: - dest: /etc/systemd/system/gcsfuse.service - owner: root - group: root - mode: 0o644 - content: | - [Unit] - Description=gcsfuse mount of all buckets - After=local-fs.target - - [Service] - Type=simple - User=root - ExecStartPre=/bin/mkdir -p /gcs - ExecStart=gcsfuse --config-file /etc/gcsfuse.yml $(vars.hns_gcs_bucket) /gcs - ExecStop=fusermount3 -u /gcs - - [Install] - WantedBy=slurmd.service multi-user.target - - post_tasks: - - name: Enable and restart gcsfuse - ansible.builtin.service: - name: gcsfuse.service - state: restarted - enabled: true - -deployment_groups: -- group: image-env - modules: - - id: slurm-image-network - source: modules/network/vpc - - - id: slurm-build-script - source: modules/scripts/startup-script - settings: - install_ansible: true - docker: - enabled: true - runners: - - type: data - destination: /etc/cluster_toolkit/a3ultra-prod-slurm-image.yaml - source: ../.ghpc/artifacts/expanded_blueprint.yaml - - type: data - destination: /var/tmp/slurm_vars.json - content: | - { - "reboot": false, - "install_cuda": false, - "install_gcsfuse": true, - "install_lustre": false, - "install_ompi": true, - "update_kernel": false, - "monitoring_agent": "cloud-ops", - } - - type: shell - destination: install_slurm.sh - content: | - #!/bin/bash - set -e -o pipefail - ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C $(vars.build_slurm_from_git_ref) \ - -i localhost, --limit localhost --connection=local \ - -e @/var/tmp/slurm_vars.json \ - 
ansible/playbook.yml - # this duplicates the ulimits configuration of the HPC VM Image - - type: data - destination: /etc/security/limits.d/99-unlimited.conf - content: | - * - memlock unlimited - * - nproc unlimited - * - stack unlimited - * - nofile 1048576 - * - cpu unlimited - * - rtprio unlimited - - type: data - destination: /etc/systemd/system/slurmd.service.d/file_ulimit.conf - content: | - [Service] - LimitNOFILE=infinity - - type: data - destination: /etc/netplan/60-cloud-mrdma-init.yaml - content: | - network: - ethernets: - primary: - match: - name: enp0s* - driver: gve - dhcp4: true - dhcp4-overrides: - use-domains: true - dhcp6: true - dhcp6-overrides: - use-domains: true - optional: true - secondary: - match: - driver: gve - dhcp4: true - dhcp4-overrides: - use-domains: false - use-dns: false - use-ntp: false - dhcp6: true - dhcp6-overrides: - use-domains: false - use-dns: false - use-ntp: false - optional: true - mrdma_devices: - match: - driver: mlx5_core - dhcp-identifier: mac - dhcp4: true - dhcp4-overrides: - use-domains: true - use-dns: false - use-ntp: false - optional: true - version: 2 - - type: ansible-local - destination: configure_gpu.yml - content: | - --- - - name: Install NVIDIA packages - hosts: all - become: true - vars: - distribution: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | replace('.','') }}" - cuda_repo_url: https://developer.download.nvidia.com/compute/cuda/repos/{{ distribution }}/x86_64/cuda-keyring_1.1-1_all.deb - cuda_repo_filename: /tmp/{{ cuda_repo_url | basename }} - enable_nvidia_dcgm: false - nvidia_packages: - - cuda-toolkit-12-4 - - datacenter-gpu-manager - - libnvidia-nscq-550 - tasks: - - name: Download NVIDIA repository package - ansible.builtin.get_url: - url: "{{ cuda_repo_url }}" - dest: "{{ cuda_repo_filename }}" - - name: Install NVIDIA repository package - ansible.builtin.apt: - deb: "{{ cuda_repo_filename }}" - state: present - - name: Reduce NVIDIA repository priority - ansible.builtin.copy: - dest: /etc/apt/preferences.d/cuda-repository-pin-600 - mode: 0o0644 - owner: root - group: root - content: | - Package: nsight-compute - Pin: origin *ubuntu.com* - Pin-Priority: -1 - - Package: nsight-systems - Pin: origin *ubuntu.com* - Pin-Priority: -1 - - Package: * - Pin: release l=NVIDIA CUDA - Pin-Priority: 400 - - name: Install NVIDIA fabric and CUDA - ansible.builtin.apt: - name: "{{ item }}" - update_cache: true - loop: "{{ nvidia_packages }}" - - name: Freeze NVIDIA fabric and CUDA - ansible.builtin.dpkg_selections: - name: "{{ item }}" - selection: hold - loop: "{{ nvidia_packages }}" - post_tasks: - - name: Disable NVIDIA DCGM by default (enable during boot on GPU nodes) - ansible.builtin.service: - name: nvidia-dcgm.service - state: stopped - enabled: false - - type: ansible-local - destination: install_mellanox_drivers.yml - content: | - --- - - name: Update Netplan and Install Network Utils - hosts: all - become: true - tasks: - - name: Install Linux Modules Extra - ansible.builtin.package: - name: - - ibverbs-utils - state: present - - name: Apply netplan - ansible.builtin.command: netplan apply - -- group: image - modules: - - id: slurm-a3ultra-image - source: modules/packer/custom-image - kind: packer - settings: - disk_size: $(vars.disk_size_gb) - machine_type: $(vars.image_build_machine_type) - source_image_family: $(vars.base_image.family) - source_image_project_id: [$(vars.base_image.project)] - image_family: $(vars.instance_image.family) - omit_external_ip: false - use: - - 
slurm-image-network - - slurm-build-script - -- group: cluster-env - modules: - - id: a3ultra-slurm-net-0 - source: modules/network/vpc - settings: - network_name: $(vars.deployment_name)-net-0 - mtu: 8896 - subnetworks: - - subnet_name: $(vars.deployment_name)-sub-0 - subnet_region: $(vars.region) - subnet_ip: $(vars.net0_range) - - - id: a3ultra-slurm-net-1 - source: modules/network/vpc - settings: - network_name: $(vars.deployment_name)-net-1 - mtu: 8896 - subnetworks: - - subnet_name: $(vars.deployment_name)-sub-1 - subnet_region: $(vars.region) - subnet_ip: $(vars.net1_range) - - - id: a3ultra-slurm-rdma-net - source: modules/network/gpu-rdma-vpc - settings: - network_name: $(vars.deployment_name)-rdma-net - network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce - network_routing_mode: REGIONAL - nic_type: MRDMA - subnetworks_template: - name_prefix: $(vars.deployment_name)-mrdma-sub - count: 8 - ip_range: $(vars.rdma_net_range) - region: $(vars.region) - - - id: homefs - source: modules/file-system/filestore - use: - - a3ultra-slurm-net-0 - settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 - local_mount: /home - reserved_ip_range: $(vars.filestore_ip_range) - deletion_protection: - enabled: true - reason: Avoid data loss - outputs: - - network_storage - -- group: cluster - modules: - - id: a3ultra_startup - source: modules/scripts/startup-script - settings: - local_ssd_filesystem: - mountpoint: $(vars.local_ssd_mountpoint) - permissions: "1777" # must quote numeric filesystem permissions! - docker: - enabled: true - world_writable: true - daemon_config: | - { - "data-root": "$(vars.local_ssd_mountpoint)/docker" - } - runners: $(flatten([vars.a3u_runners])) - - - id: a3_ultra_nodeset - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset - use: [a3ultra-slurm-net-0, a3ultra_startup] - settings: - bandwidth_tier: gvnic_enabled - machine_type: a3-ultragpu-8g - instance_image_custom: true - enable_public_ips: true - node_count_static: $(vars.a3u_cluster_size) - node_count_dynamic_max: 0 - enable_placement: false - disk_type: hyperdisk-balanced - on_host_maintenance: TERMINATE - reservation_name: $(vars.a3u_reservation_name) - additional_networks: - $(concat( - [{ - network=null, - subnetwork=a3ultra-slurm-net-1.subnetwork_self_link, - subnetwork_project=vars.project_id, - nic_type="GVNIC", - queue_count=null, - network_ip="", - stack_type=null, - access_config=[], - ipv6_access_config=[], - alias_ip_range=[] - }], - a3ultra-slurm-rdma-net.subnetwork_interfaces - )) - - - id: a3_ultra_partition - source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: - - a3_ultra_nodeset - settings: - exclusive: false - partition_name: a3ultra - is_default: true - partition_conf: - ResumeTimeout: 900 - SuspendTimeout: 600 - OverSubscribe: EXCLUSIVE - - - id: controller_startup - source: modules/scripts/startup-script - settings: - runners: $(flatten([vars.shared_runners, vars.controller_runners, vars.gcsfuse_runners])) - - - id: login_startup - source: modules/scripts/startup-script - settings: - runners: $(flatten([vars.shared_runners, vars.gcsfuse_runners])) - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v6-login - use: [a3ultra-slurm-net-0] - settings: - instance_image_custom: true - disk_size_gb: 300 - enable_login_public_ips: true - machine_type: n2-standard-8 - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller - 
use: - - a3ultra-slurm-net-0 - - a3_ultra_partition - - slurm_login - - homefs - settings: - enable_controller_public_ips: true - instance_image_custom: true - disk_type: pd-extreme - disk_size_gb: 300 - machine_type: n2-standard-80 - controller_startup_script: $(controller_startup.startup_script) - login_startup_script: $(login_startup.startup_script) - enable_external_prolog_epilog: true From 57fd7af299d45636394e944e655b68c54faef398 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 7 Jan 2025 08:38:57 -0800 Subject: [PATCH 099/140] Delete examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/deployment.yaml --- .../a3u-slurm-ubuntu-gcs/deployment.yaml | 31 ------------------- 1 file changed, 31 deletions(-) delete mode 100644 examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/deployment.yaml diff --git a/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/deployment.yaml b/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/deployment.yaml deleted file mode 100644 index d955eda1f4..0000000000 --- a/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/deployment.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -# If using GCS as a terraform backend (suggested), add the following. If not, -# comment out or remove. -terraform_backend_defaults: - type: gcs - configuration: - bucket: # Name of terraform state bucket. -# End of optional section - -vars: - deployment_name: # Unique name of this Cluster Toolkit Deployment, e.g. a3u-gcs - project_id: # Your GCP project name - region: # e.g. europe-west1 - zone: # e.g. europe-west1-b - a3u_reservation_name: # reservation name, e.g. a3u-reservation-00 - a3u_cluster_size: # Number of A3-Ultra nodes in the cluster - hns_gcs_bucket: # This bucket must have been previously created From 0a3b379a0d6bad93103f8a32276b7a5d8d369b3d Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 7 Jan 2025 08:39:22 -0800 Subject: [PATCH 100/140] Delete examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/run-nccl-tests-via-ramble.sh --- .../run-nccl-tests-via-ramble.sh | 224 ------------------ 1 file changed, 224 deletions(-) delete mode 100644 examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/run-nccl-tests-via-ramble.sh diff --git a/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/run-nccl-tests-via-ramble.sh b/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/run-nccl-tests-via-ramble.sh deleted file mode 100644 index 62061533f3..0000000000 --- a/examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/run-nccl-tests-via-ramble.sh +++ /dev/null @@ -1,224 +0,0 @@ -#!/bin/bash -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
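Note that the `hns_gcs_bucket` referenced in the deployment file above must already exist before deployment. As an illustration only (the bucket name and location are placeholders, and the flags should be verified against current gcloud documentation), a hierarchical-namespace bucket can be created along these lines:

```text
# Hypothetical bucket name and location; verify flags for your gcloud version.
gcloud storage buckets create gs://my-a3u-hns-bucket \
  --location=europe-west1 \
  --uniform-bucket-level-access \
  --enable-hierarchical-namespace
```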
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -eu - -trap "printf '\nCaught Ctrl+c. Exiting...\n'; exit" INT - -# Use current unix timestamp as a unique tag -# for jobs submitted -TAG=$(date +%s) -TEST_DIR=nccl-tests-"${TAG}" -SOFTWARE_INSTALL=/opt/apps - -cat <"${TEST_DIR}"/configs/ramble.yaml -# Ramble Configuration for NCCL Tests -ramble: - env_vars: - set: - OMPI_MCA_pml: "^ucx" - OMPI_MCA_btl: "^openib" - OMPI_MCA_btl_tcp_if_include: enp0s19 - - CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 - NCCL_NET: gIB - NCCL_SOCKET_IFNAME: enp0s19,enp192s20 - NCCL_CROSS_NIC: 0 - NCCL_NET_GDR_LEVEL: PIX - NCCL_P2P_NET_CHUNKSIZE: 131072 - NCCL_P2P_PCI_CHUNKSIZE: 131072 - NCCL_P2P_NVL_CHUNKSIZE: 524288 - NCCL_NVLS_CHUNKSIZE: 524288 - NCCL_IB_GID_INDEX: 3 - NCCL_IB_ADAPTIVE_ROUTING: 1 - NCCL_IB_QPS_PER_CONNECTION: 4 - NCCL_IB_TC: 52 - NCCL_IB_FIFO_TC: 84 - NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE: /usr/local/gib/configs/guest_config.txtpb - NCCL_TUNER_CONFIG_PATH: /usr/local/gib/configs/tuner_config.txtpb - prepend: - - paths: - LD_LIBRARY_PATH: /usr/local/gib/lib64 - - variables: - mpi_command: srun --mpi=pmix - batch_submit: 'sbatch {execute_experiment}' - processes_per_node: '{gpus_per_node}' - gpus_per_node: '8' - applications: - nccl-tests: - workloads: - '{workload}': - experiments: - '{workload}-{n_nodes}': - variants: - package_manager: spack - variables: - workload: [all-gather, all-reduce, reduce-scatter] - n_nodes: [2, 4, 8, 16, 32] - matrix: - - n_nodes - - workload - - software: - packages: - pmix: - pkg_spec: pmix - mpi: - pkg_spec: openmpi +cuda cuda_arch=90 - cuda: - pkg_spec: cuda@12.4.0 - nccl: - pkg_spec: nccl@2.23.4-1 cuda_arch=90 - nccl-tests: - pkg_spec: nccl-tests cuda_arch=90 - environments: - nccl-tests: - packages: [cuda, mpi, nccl, nccl-tests, pmix] - -EOF - -# Populate slurm sbatch script -cat <"${TEST_DIR}"/configs/execute_experiment.tpl -#!/bin/bash -#SBATCH -J {experiment_name}-"${TAG}" -#SBATCH --output={experiment_run_dir}/slurm-%j.out -#SBATCH -N {n_nodes} -#SBATCH --gpus-per-node=8 -#SBATCH --exclusive -#SBATCH --ntasks-per-node={processes_per_node} - -cd "{experiment_run_dir}" -{command} -EOF - -# Get number of nodes available -N_NODES=$(sinfo -h -o %D) - -# Print available benchmarks -printf "\n--------- Setting up Benchmarks ----------\n" -ramble workspace info --where '{n_nodes} <= '"$N_NODES" - -printf "\n------- About to run the following: ------\n\n" -printf "source %s/ramble/env/bin/activate\n" "${SOFTWARE_INSTALL}" -printf ". %s/ramble/share/ramble/setup-env.sh\n" "${SOFTWARE_INSTALL}" -printf ". 
%s/spack/share/spack/setup-env.sh\n" "${SOFTWARE_INSTALL}" -printf "ramble workspace activate %s\n" "${TEST_DIR}" -printf "ramble workspace setup --where '{n_nodes} <= %s'\n" "${N_NODES}" -printf "ramble on --where '{n_nodes} <= %s' \n" "${N_NODES}" - -# Set up experiments -printf "\n--------- Setting up Benchmarks -------\n" -printf " This may take 20-30 minutes \n" -ramble workspace setup --where '{n_nodes} <= '"${N_NODES}" - -# Submit Experiments to Slurm -printf "\n----------- Running Benchmarks --------\n" -ramble on --where '{n_nodes} <= '"${N_NODES}" - -# Wait for all to be done -# Use the TAG in the slurm jobs -until [[ $(squeue -h -o %j | grep -c "${TAG}") -eq 0 ]]; do - clear - echo "waiting for $(squeue -h -o %j | grep -c "${TAG}") jobs to finish" - squeue - sleep 5 -done - -# Analyze -ramble workspace analyze -f json --where '{n_nodes} <= '"${N_NODES}" - -# Summarize all results in summary.tsv -cd "${TEST_DIR}" -jq -r '["workload","n_nodes","msg_size","busbw"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | -{ - experiment_name: $exp.name, - workload: $exp.workload_name, - n_nodes: $exp.n_nodes, - Context: $context.name -} + -($context.foms | from_entries ) -| [.workload, .n_nodes, .Size, ."Out of Place Bus Bandwidth"]) -| @tsv' results.latest.json >summary.tsv - -# Print just the 8GB message sizes -printf "\n--- SUMMARY for 8GB Message Sizes --\n" -jq -r '["workload","n_nodes","msg_size","busbw"], (.experiments[] as $exp | $exp.CONTEXTS[] as $context | -{ - experiment_name: $exp.name, - workload: $exp.workload_name, - n_nodes: $exp.n_nodes, - Context: $context.name -} + -($context.foms | from_entries ) -| select(.Size | tonumber > 8000000000) -| [.workload, .n_nodes, .Size, ."Out of Place Bus Bandwidth"]) -| @tsv' results.latest.json -printf "\nFor full results, see \"summary.tsv\"\n" - -printf "\n- To reactivate this ramble workspace, run -\n\n" -printf "source %s/ramble/env/bin/activate\n" "${SOFTWARE_INSTALL}" -printf ". %s/ramble/share/ramble/setup-env.sh\n" "${SOFTWARE_INSTALL}" -printf ". %s/spack/share/spack/setup-env.sh\n" "${SOFTWARE_INSTALL}" -printf "ramble workspace activate %s\n" "${TEST_DIR}" From bd4e4ceb310cc82be4ec7a8617f1416a19412928 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 7 Jan 2025 08:39:42 -0800 Subject: [PATCH 101/140] Delete examples/machine-learning/a3-ultragpu-8g/README.md --- .../machine-learning/a3-ultragpu-8g/README.md | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 examples/machine-learning/a3-ultragpu-8g/README.md diff --git a/examples/machine-learning/a3-ultragpu-8g/README.md b/examples/machine-learning/a3-ultragpu-8g/README.md deleted file mode 100644 index dfa3bb17c5..0000000000 --- a/examples/machine-learning/a3-ultragpu-8g/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# A3 Ultra Blueprints - -For further information on deploying an A3 Ultra cluster with Slurm, please -see: - -[Create A3 Ultra Slurm Cluster](https://cloud.google.com/ai-hypercomputer/docs/create/create-slurm-cluster) - -If you are unable to access these documents, please contact your -[Technical Account Manager (TAM)](https://cloud.google.com/tam). - -## Deploy A3 Ultra compute VM with custom startup-scripts - -Customers can deploy [a3ultra-vm.yaml] blueprint to deploy 2 A3 Ultra VMs. You -can also specify custom startup-scripts to run in the blueprint. 
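A custom startup script referenced from the blueprint is an ordinary shell script executed on each VM at boot. The snippet below is only a sketch (it is not part of this repository) and assumes `nvidia-smi` and the `ibverbs-utils` tools are available on the image:

```text
#!/bin/bash
# Hypothetical sanity-check startup script.
set -e
# List the GPUs visible to the driver.
nvidia-smi -L
# List the RDMA devices visible to the verbs stack.
ibv_devices
```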
- -[a3ultra-vm.yaml]: ./a3ultra-vm.yaml From a897c6a35e828d08e0a88776f4a89af442ec31ab Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 7 Jan 2025 08:40:00 -0800 Subject: [PATCH 102/140] Delete examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml --- .../a3ultra-slurm-blueprint.yaml | 451 ------------------ 1 file changed, 451 deletions(-) delete mode 100644 examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml diff --git a/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml b/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml deleted file mode 100644 index 29b08add88..0000000000 --- a/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml +++ /dev/null @@ -1,451 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -# This blueprint uses private preview functionality in limited availability, -# see README.md for further information - -# This blueprint requires a Cluster Toolkit binary built from a -# release >= 1.44.0 - -blueprint_name: a3ultra-slurm - -vars: - deployment_name: # supply deployment name - project_id: # supply project ID - region: # supply region - zone: # supply zone - a3u_cluster_size: # supply cluster size - a3u_reservation_name: # supply reservation name - # Image settings - base_image: - project: ubuntu-os-accelerator-images - family: ubuntu-accelerator-2204-amd64-with-nvidia-550 - image_build_machine_type: n2-standard-16 - build_slurm_from_git_ref: 6.8.7 - # Cluster env settings - # net0 and filestore ranges must not overlap - net0_range: 192.168.0.0/19 - filestore_ip_range: 192.168.32.0/24 - net1_range: 192.168.64.0/18 - rdma_net_range: 192.168.128.0/18 - # Cluster Settings - local_ssd_mountpoint: /mnt/localssd - instance_image: - project: $(vars.project_id) - family: $(vars.deployment_name)-u22 - disk_size_gb: 200 - nccl_plugin_version: v1.0.2 - -deployment_groups: -- group: image-env - modules: - - id: slurm-image-network - source: modules/network/vpc - - - id: slurm-build-script - source: modules/scripts/startup-script - settings: - install_ansible: true - docker: - enabled: true - runners: - - type: data - destination: /etc/cluster_toolkit/a3ultra-prod-slurm-image.yaml - source: ../.ghpc/artifacts/expanded_blueprint.yaml - - type: data - destination: /var/tmp/slurm_vars.json - content: | - { - "reboot": false, - "install_cuda": false, - "install_gcsfuse": true, - "install_lustre": false, - "install_ompi": true, - "update_kernel": false, - "monitoring_agent": "cloud-ops", - } - - type: shell - destination: install_slurm.sh - content: | - #!/bin/bash - set -e -o pipefail - ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C $(vars.build_slurm_from_git_ref) \ - -i localhost, --limit localhost --connection=local \ - -e @/var/tmp/slurm_vars.json \ - ansible/playbook.yml - # this duplicates the ulimits configuration of the HPC VM Image - - type: data - destination: /etc/security/limits.d/99-unlimited.conf - content: | - * - 
memlock unlimited - * - nproc unlimited - * - stack unlimited - * - nofile 1048576 - * - cpu unlimited - * - rtprio unlimited - - type: data - destination: /etc/systemd/system/slurmd.service.d/file_ulimit.conf - content: | - [Service] - LimitNOFILE=infinity - - type: data - destination: /etc/netplan/60-cloud-mrdma-init.yaml - content: | - network: - ethernets: - primary: - match: - name: enp0s* - driver: gve - dhcp4: true - dhcp4-overrides: - use-domains: true - dhcp6: true - dhcp6-overrides: - use-domains: true - optional: true - secondary: - match: - driver: gve - dhcp4: true - dhcp4-overrides: - use-domains: false - use-dns: false - use-ntp: false - dhcp6: true - dhcp6-overrides: - use-domains: false - use-dns: false - use-ntp: false - optional: true - mrdma_devices: - match: - driver: mlx5_core - dhcp-identifier: mac - dhcp4: true - dhcp4-overrides: - use-domains: true - use-dns: false - use-ntp: false - optional: true - version: 2 - - type: ansible-local - destination: configure_gpu.yml - content: | - --- - - name: Install NVIDIA packages - hosts: all - become: true - vars: - distribution: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | replace('.','') }}" - cuda_repo_url: https://developer.download.nvidia.com/compute/cuda/repos/{{ distribution }}/x86_64/cuda-keyring_1.1-1_all.deb - cuda_repo_filename: /tmp/{{ cuda_repo_url | basename }} - enable_nvidia_dcgm: false - nvidia_packages: - - cuda-toolkit-12-4 - - datacenter-gpu-manager - - libnvidia-nscq-550 - tasks: - - name: Download NVIDIA repository package - ansible.builtin.get_url: - url: "{{ cuda_repo_url }}" - dest: "{{ cuda_repo_filename }}" - - name: Install NVIDIA repository package - ansible.builtin.apt: - deb: "{{ cuda_repo_filename }}" - state: present - - name: Reduce NVIDIA repository priority - ansible.builtin.copy: - dest: /etc/apt/preferences.d/cuda-repository-pin-600 - mode: 0o0644 - owner: root - group: root - content: | - Package: nsight-compute - Pin: origin *ubuntu.com* - Pin-Priority: -1 - - Package: nsight-systems - Pin: origin *ubuntu.com* - Pin-Priority: -1 - - Package: * - Pin: release l=NVIDIA CUDA - Pin-Priority: 400 - - name: Install NVIDIA fabric and CUDA - ansible.builtin.apt: - name: "{{ item }}" - update_cache: true - loop: "{{ nvidia_packages }}" - - name: Freeze NVIDIA fabric and CUDA - ansible.builtin.dpkg_selections: - name: "{{ item }}" - selection: hold - loop: "{{ nvidia_packages }}" - post_tasks: - - name: Disable NVIDIA DCGM by default (enable during boot on GPU nodes) - ansible.builtin.service: - name: nvidia-dcgm.service - state: stopped - enabled: false - - type: ansible-local - destination: install_mellanox_drivers.yml - content: | - --- - - name: Update Netplan and Install Network Utils - hosts: all - become: true - tasks: - - name: Install Linux Modules Extra - ansible.builtin.package: - name: - - ibverbs-utils - state: present - - name: Apply netplan - ansible.builtin.command: netplan apply - -- group: image - modules: - - id: slurm-a3ultra-image - source: modules/packer/custom-image - kind: packer - settings: - disk_size: $(vars.disk_size_gb) - machine_type: $(vars.image_build_machine_type) - source_image_family: $(vars.base_image.family) - source_image_project_id: [$(vars.base_image.project)] - image_family: $(vars.instance_image.family) - omit_external_ip: false - use: - - slurm-image-network - - slurm-build-script - -- group: cluster-env - modules: - - id: a3ultra-slurm-net-0 - source: modules/network/vpc - settings: - network_name: $(vars.deployment_name)-net-0 
- mtu: 8896 - enable_internal_traffic: false # Setting firewall below instead - subnetworks: - - subnet_name: $(vars.deployment_name)-sub-0 - subnet_region: $(vars.region) - subnet_ip: $(vars.net0_range) - firewall_rules: - - name: $(vars.deployment_name)-internal-0 - ranges: [$(vars.net0_range)] - allow: - - protocol: tcp - - protocol: udp - - protocol: icmp - - - id: a3ultra-slurm-net-1 - source: modules/network/vpc - settings: - network_name: $(vars.deployment_name)-net-1 - mtu: 8896 - enable_internal_traffic: false # Setting firewall below instead - subnetworks: - - subnet_name: $(vars.deployment_name)-sub-1 - subnet_region: $(vars.region) - subnet_ip: $(vars.net1_range) - firewall_rules: - - name: $(vars.deployment_name)-internal-1 - ranges: [$(vars.net1_range)] - allow: - - protocol: tcp - - protocol: udp - - protocol: icmp - - - id: a3ultra-slurm-rdma-net - source: modules/network/gpu-rdma-vpc - settings: - network_name: $(vars.deployment_name)-rdma-net - network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce - network_routing_mode: REGIONAL - subnetworks_template: - name_prefix: $(vars.deployment_name)-mrdma-sub - count: 8 - ip_range: $(vars.rdma_net_range) - region: $(vars.region) - firewall_rules: - - name: $(vars.deployment_name)-internal-rdma - ranges: [$(vars.rdma_net_range)] - allow: - - protocol: tcp - - protocol: udp - - protocol: icmp - - - id: homefs - source: modules/file-system/filestore - use: - - a3ultra-slurm-net-0 - settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 - local_mount: /home - reserved_ip_range: $(vars.filestore_ip_range) - deletion_protection: - enabled: true - reason: Avoid data loss - outputs: - - network_storage - -- group: cluster - modules: - - id: a3ultra_startup - source: modules/scripts/startup-script - settings: - local_ssd_filesystem: - mountpoint: $(vars.local_ssd_mountpoint) - permissions: "1777" # must quote numeric filesystem permissions! 
- docker: - enabled: true - world_writable: true - daemon_config: | - { - "data-root": "$(vars.local_ssd_mountpoint)/docker" - } - runners: - - type: data - destination: /etc/enroot/enroot.conf - content: | - ENROOT_RUNTIME_PATH $(vars.local_ssd_mountpoint)/${UID}/enroot/runtime - ENROOT_CACHE_PATH $(vars.local_ssd_mountpoint)/${UID}/enroot/cache - ENROOT_DATA_PATH $(vars.local_ssd_mountpoint)/${UID}/enroot/data - ENROOT_TEMP_PATH $(vars.local_ssd_mountpoint)/${UID}/enroot - - type: ansible-local - destination: nccl_plugin.yml - content: | - --- - - name: Install NCCL plugin for A3 Ultra series - hosts: all - become: true - tasks: - - name: Add SystemD unit for NCCL plugin installation - ansible.builtin.copy: - dest: /etc/systemd/system/nccl-plugin@.service - mode: 0o0644 - content: | - [Unit] - After=network-online.target - Before=slurmd.service - - [Service] - Type=oneshot - ExecStartPre=/usr/bin/rm -rf /usr/local/gib - ExecStartPre=/usr/bin/mkdir -p /usr/local/gib - ExecStartPre=/snap/bin/gcloud auth configure-docker --quiet us-docker.pkg.dev - ExecStart=/usr/bin/docker run --rm --name nccl-gib-installer --volume /usr/local/gib:/var/lib/gib \ - us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:%i install --install-nccl - - [Install] - WantedBy=slurmd.service - notify: - - Reload SystemD - handlers: - - name: Reload SystemD - ansible.builtin.systemd: - daemon_reload: true - post_tasks: - - name: Enable NCCL plugin SystemD unit - ansible.builtin.service: - name: nccl-plugin@$(vars.nccl_plugin_version).service - state: started - enabled: true - - - id: a3_ultra_nodeset - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset - use: [a3ultra-slurm-net-0, a3ultra_startup] - settings: - bandwidth_tier: gvnic_enabled - machine_type: a3-ultragpu-8g - instance_image_custom: true - enable_public_ips: true - node_count_static: $(vars.a3u_cluster_size) - node_count_dynamic_max: 0 - enable_placement: false - disk_type: hyperdisk-balanced - on_host_maintenance: TERMINATE - reservation_name: $(vars.a3u_reservation_name) - additional_networks: - $(concat( - [{ - network=null, - subnetwork=a3ultra-slurm-net-1.subnetwork_self_link, - subnetwork_project=vars.project_id, - nic_type="GVNIC", - queue_count=null, - network_ip="", - stack_type=null, - access_config=[], - ipv6_access_config=[], - alias_ip_range=[] - }], - a3ultra-slurm-rdma-net.subnetwork_interfaces - )) - - - id: a3_ultra_partition - source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: - - a3_ultra_nodeset - settings: - exclusive: false - partition_name: a3ultra - is_default: true - partition_conf: - ResumeTimeout: 900 - SuspendTimeout: 600 - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v6-login - use: [a3ultra-slurm-net-0] - settings: - instance_image_custom: true - disk_size_gb: 300 - enable_login_public_ips: true - machine_type: n2-standard-8 - - - id: controller_startup - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: stage_scripts.sh - content: | - #!/bin/bash - SLURM_ROOT=/opt/apps/adm/slurm - PARTITION_NAME=$(a3_ultra_partition.partitions[0].partition_name) - mkdir -m 0755 -p "${SLURM_ROOT}/scripts" - mkdir -p "${SLURM_ROOT}/partition-${PARTITION_NAME}-epilog_slurmd.d" - ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-${PARTITION_NAME}-epilog_slurmd.d/gpu-test.epilog_slurmd" - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller - use: - - a3ultra-slurm-net-0 - - 
a3_ultra_partition - - slurm_login - - homefs - settings: - enable_controller_public_ips: true - instance_image_custom: true - disk_type: pd-extreme - disk_size_gb: 300 - machine_type: n2-standard-80 - controller_startup_script: $(controller_startup.startup_script) - enable_external_prolog_epilog: true From 5cad6083080ee85b4286a0691a3069808512554f Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 7 Jan 2025 08:42:51 -0800 Subject: [PATCH 103/140] Delete examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-deployment.yaml --- .../a3ultra-slurm-deployment.yaml | 26 ------------------- 1 file changed, 26 deletions(-) delete mode 100644 examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-deployment.yaml diff --git a/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-deployment.yaml b/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-deployment.yaml deleted file mode 100644 index 6fa29af09e..0000000000 --- a/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-deployment.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- -terraform_backend_defaults: - type: gcs - configuration: - bucket: # supply existing bucket to store Terraform state - -vars: - deployment_name: # supply unique deployment name - project_id: # supply existing project id - region: # supply region with a3-ultragpu-8g capacity in reservation - zone: # supply zone with a3-ultragpu-8g capacity in reservation - a3u_reservation_name: # supply a3-ultragpu-8g reservation name - a3u_cluster_size: # supply a3-ultragpu-8g reservation size From ec6d723b95f57e40ab85787bec3de873170086fb Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 7 Jan 2025 08:51:31 -0800 Subject: [PATCH 104/140] Delete examples/machine-learning/a3-ultragpu-8g/a3ultra-vm.yaml --- .../a3-ultragpu-8g/a3ultra-vm.yaml | 151 ------------------ 1 file changed, 151 deletions(-) delete mode 100644 examples/machine-learning/a3-ultragpu-8g/a3ultra-vm.yaml diff --git a/examples/machine-learning/a3-ultragpu-8g/a3ultra-vm.yaml b/examples/machine-learning/a3-ultragpu-8g/a3ultra-vm.yaml deleted file mode 100644 index 25d7fd83bf..0000000000 --- a/examples/machine-learning/a3-ultragpu-8g/a3ultra-vm.yaml +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
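For context, a deployment file such as the a3ultra-slurm-deployment.yaml above is normally passed to the Cluster Toolkit binary together with its blueprint. The command below is a sketch based on general Toolkit usage rather than on anything in this patch series; the binary name, flags, and file names are assumptions:

```text
# Hypothetical invocation; adjust file names to match your checkout.
./gcluster deploy -d a3ultra-slurm-deployment.yaml a3ultra-slurm-blueprint.yaml --auto-approve
```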
- ---- - -blueprint_name: a3ultra-vm-instance - -vars: - project_id: # supply project ID - deployment_name: a3ultra-vm-instance - region: europe-west1 - zone: europe-west1-b - instance_image: - project: ubuntu-os-accelerator-images - family: ubuntu-accelerator-2204-amd64-with-nvidia-550 - net0_range: 192.168.0.0/19 - net1_range: 192.168.64.0/18 - filestore_ip_range: 192.168.32.0/24 - rdma_net_range: 192.168.128.0/18 - hostname_prefix: $(vars.deployment_name)-beowulf - -deployment_groups: -- group: primary - modules: - - - id: a3ultra-net-0 - source: modules/network/vpc - settings: - network_name: $(vars.deployment_name)-net-0 - mtu: 8896 - subnetworks: - - subnet_name: $(vars.deployment_name)-sub-0 - subnet_region: $(vars.region) - subnet_ip: $(vars.net0_range) - firewall_rules: - - name: $(vars.deployment_name)-internal-0 - ranges: [$(vars.net0_range)] - allow: - - protocol: tcp - - protocol: udp - - protocol: icmp - - - id: a3ultra-net-1 - source: modules/network/vpc - settings: - network_name: $(vars.deployment_name)-net-1 - mtu: 8896 - subnetworks: - - subnet_name: $(vars.deployment_name)-sub-1 - subnet_region: $(vars.region) - subnet_ip: $(vars.net1_range) - firewall_rules: - - name: $(vars.deployment_name)-internal-1 - ranges: [$(vars.net1_range)] - allow: - - protocol: tcp - - protocol: udp - - protocol: icmp - - - id: a3ultra-rdma-net - source: modules/network/gpu-rdma-vpc - settings: - network_name: $(vars.deployment_name)-rdma-net - network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce - network_routing_mode: REGIONAL - subnetworks_template: - name_prefix: $(vars.deployment_name)-mrdma-sub - count: 8 - ip_range: $(vars.rdma_net_range) - region: $(vars.region) - firewall_rules: - - name: $(vars.deployment_name)-internal-rdma - ranges: [$(vars.rdma_net_range)] - allow: - - protocol: tcp - - protocol: udp - - protocol: icmp - - - id: homefs - source: modules/file-system/filestore - use: [a3ultra-net-0] - settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 - local_mount: /home - reserved_ip_range: $(vars.filestore_ip_range) - outputs: - - network_storage - - - id: startup-script - source: modules/scripts/startup-script - settings: - configure_ssh_host_patterns: - - $(vars.hostname_prefix)-* - - - id: a3ultra-vms - source: modules/compute/vm-instance - use: [startup-script, homefs] - settings: - machine_type: a3-ultragpu-8g - instance_count: 2 - name_prefix: $(vars.hostname_prefix) - disk_type: hyperdisk-balanced - automatic_restart: true - on_host_maintenance: TERMINATE - reservation_name: # supply reservation name - network_interfaces: - $(concat( - [{ - network=null, - subnetwork=a3ultra-net-0.subnetwork_self_link, - subnetwork_project=vars.project_id, - nic_type="GVNIC", - queue_count=null, - network_ip=null, - stack_type=null, - access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], - ipv6_access_config=[], - alias_ip_range=[] - }, - { - network=null, - subnetwork=a3ultra-net-1.subnetwork_self_link, - subnetwork_project=vars.project_id, - nic_type="GVNIC", - queue_count=null, - network_ip=null, - stack_type=null, - access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], - ipv6_access_config=[], - alias_ip_range=[] - }], - a3ultra-rdma-net.subnetwork_interfaces, - )) - - - id: wait-for-vms - source: community/modules/scripts/wait-for-startup - settings: - instance_names: $(a3ultra-vms.name) - timeout: 7200 From 9e13c07fcedf29a502f68c6c5001873095947b95 
Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 7 Jan 2025 08:51:46 -0800 Subject: [PATCH 105/140] Delete examples/machine-learning/a3-ultragpu-8g/nccl-tests/README.md --- .../a3-ultragpu-8g/nccl-tests/README.md | 89 ------------------- 1 file changed, 89 deletions(-) delete mode 100644 examples/machine-learning/a3-ultragpu-8g/nccl-tests/README.md diff --git a/examples/machine-learning/a3-ultragpu-8g/nccl-tests/README.md b/examples/machine-learning/a3-ultragpu-8g/nccl-tests/README.md deleted file mode 100644 index 3f6dfab5c9..0000000000 --- a/examples/machine-learning/a3-ultragpu-8g/nccl-tests/README.md +++ /dev/null @@ -1,89 +0,0 @@ -The examples in this directory are used to show how enroot + pyxis can be used -to launch containerized workloads via Slurm. - -Contents: - -* `build-nccl-tests.sh`: A Slurm batch script for building the nccl-tests. -* `run-nccl-tests.sh`: A Slurm batch script for running the nccl-tests - `all_reduce_perf` benchmark. -* `import_container.sh`: Uses enroot to create a squashfs container image. Added - for reference only. enroot import happens within the `build-nccl-tests.sh`. - -# Running NCCL-Tests via Enroot/Pyxis - -In general, the workflow to deploy GPUDirect-RDMA-enabled workloads via enroot-pyxis is -the following: - -1. Convert your container into a squashfs based container image -2. Set required environment variables -3. Run your application workload - -## TLDR - -For an end-to-end example, copy the `build-nccl-tests.sh` and -`run-nccl-tests.sh` to your login node. - -And run the following: - -```text -BUILD_JOB=$(sbatch --parsable build-nccl-tests.sh) # takes ~4 minutes -sbatch -d afterok:${BUILD_JOB} run-nccl-tests.sh # takes ~3 minutes -``` - -The latter should result in a slurm-XX.out file that contains the result of the nccl -`all_gather_perf` benchmark: - -```text -# -# out-of-place in-place -# size count type redop root time algbw busbw #wrong time algbw busbw #wrong -# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) - 268435456 4194304 float none -1 XXXXX XXX.XX XXX.XX N/A XXXXXX XXX.XX XXX.XX 0 - 536870912 8388608 float none -1 XXXXX XXX.XX XXX.XX N/A XXXXXX XXX.XX XXX.XX 0 - 1073741824 16777216 float none -1 XXXXX XXX.XX XXX.XX N/A XXXXXX XXX.XX XXX.XX 0 - 2147483648 33554432 float none -1 XXXXX XXX.XX XXX.XX N/A XXXXXX XXX.XX XXX.XX 0 - 4294967296 67108864 float none -1 XXXXX XXX.XX XXX.XX N/A XXXXXX XXX.XX XXX.XX 0 - 8589934592 134217728 float none -1 XXXXX XXX.XX XXX.XX N/A XXXXXX XXX.XX XXX.XX 0 -# Out of bounds values : 0 OK -# Avg bus bandwidth : XXX.XX -# -``` - -For more details, follow the remainder of this README. - -## Detailed Instructions - -All of the following should be done on the login node of your slurm cluster, -and from somewhere on the shared Filestore filesystem (typically the user's -home directory). - -### Building NCCL-tests - -See build-nccl-tests.sh for an example. Within it, you will see that first we'll -create a squashfs version of the container we want to launch using `enroot -import`. We do this because otherwise we'd be pulling the (typically more than -10GB) image multiple times from the source on each node, converting to sqsh each -time, etc, which would make the job launch longer. - -For building the nccl-tests binaries, we use `pyxis` to run the enroot container -and build the nccl-tests within that container to ensure the resulting binaries -are compatible with the container environment.
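To make the import-then-build flow concrete, here is a rough sketch of the two steps described above. The container URI, squashfs path, partition name, and build command are placeholders rather than values taken from `build-nccl-tests.sh`:

```text
# Hypothetical sketch of the flow implemented by build-nccl-tests.sh.
# 1. Import the container once into a shared squashfs image.
enroot import -o ./nccl-build.sqsh docker://nvcr.io#nvidia/pytorch:24.04-py3

# 2. Build nccl-tests inside that container via pyxis so the binaries match
#    the container's CUDA and NCCL environment.
srun --partition=a3ultra --ntasks=1 \
  --container-image=./nccl-build.sqsh \
  --container-mounts="${PWD}:/workspace" \
  bash -c "cd /workspace && git clone https://github.com/NVIDIA/nccl-tests.git && cd nccl-tests && make -j"
```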
- -Both of the above (importing and building) are accomplished by running: - -```text -sbatch build-nccl-tests.sh -``` - -### Running your application on a3-ultra instances - -For a complete example, run: - -```text -sbatch run-nccl-tests.sh -``` - -The output will appear in a `slurm-.log` file. If the name of your a3-ultragpu -partition is different from "a3ultra", you will need to modify the `build-nccl-tests.sh` -and `run-nccl-tests.sh` scripts' `#SBATCH --partition` setting. Alternatively, you -can run `sbatch -p