Skip to content

Commit

Permalink
build(nvidia): support arm64 builds (#2137)
Browse files Browse the repository at this point in the history
  • Loading branch information
ndbaker1 authored Feb 13, 2025
1 parent 78f54f6 commit 2ffb0d5
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 5 deletions.
1 change: 1 addition & 0 deletions doc/usage/al2023.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
| `launch_block_device_mappings_volume_size` | |
| `nodeadm_build_image` | Image to use as a build environment for nodeadm |
| `nvidia_driver_major_version` | To be used only when `enable_accelerator = nvidia`. Driver version to install; depends on what is available in the NVIDIA repository. |
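| `nvidia_repository_url` | Override for the YUM/DNF repository that provides the NVIDIA driver packages. When unset, the default NVIDIA CUDA repository for the build host's architecture is used. |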
| `pause_container_image` | Image ref for the pause container image |
| `remote_folder` | Directory path for shell provisioner scripts on the builder instance |
| `runc_version` | |
Expand Down
2 changes: 1 addition & 1 deletion nodeadm/internal/containerd/runtime_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func (m *instanceTypeMixin) matches(instanceType string) bool {

var (
// TODO: fetch this list dynamically
nvidiaInstances = []string{"p3", "p3dn", "p4d", "p4de", "p5", "p5e", "p5en", "g4", "g4dn", "g5", "g6", "g6e"}
nvidiaInstances = []string{"p3", "p3dn", "p4d", "p4de", "p5", "p5e", "p5en", "g4", "g4dn", "g5", "g6", "g6e", "g5g"}
NvidiaInstanceTypeMixin = instanceTypeMixin{
instanceFamilies: nvidiaInstances,
apply: applyNvidia,
Expand Down
12 changes: 8 additions & 4 deletions templates/al2023/provisioners/install-nvidia-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,14 @@ else
DOMAIN="nvidia.com"
fi

sudo dnf config-manager --add-repo https://developer.download.${DOMAIN}/compute/cuda/repos/amzn2023/x86_64/cuda-amzn2023.repo
if [ -n "${NVIDIA_REPOSITORY:-}" ]; then
sudo dnf config-manager --add-repo ${NVIDIA_REPOSITORY}
else
sudo dnf config-manager --add-repo https://developer.download.${DOMAIN}/compute/cuda/repos/amzn2023/$(uname -m)/cuda-amzn2023.repo
fi
sudo dnf config-manager --add-repo https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo

sudo sed -i 's/gpgcheck=0/gpgcheck=1/g' /etc/yum.repos.d/nvidia-container-toolkit.repo /etc/yum.repos.d/cuda-amzn2023.repo
# update all current .repo sources to enable gpgcheck
sudo dnf config-manager --save --setopt=*.gpgcheck=1
fi

################################################################################
Expand All @@ -62,7 +66,7 @@ sudo mv ${WORKING_DIR}/gpu/kmod-util /usr/bin/

sudo mkdir -p /etc/dkms
echo "MAKE[0]=\"'make' -j$(grep -c processor /proc/cpuinfo) module\"" | sudo tee /etc/dkms/nvidia.conf
sudo dnf -y install kernel-modules-extra.x86_64
sudo dnf -y install kernel-modules-extra

function archive-open-kmods() {
if is-isolated-partition; then
Expand Down
2 changes: 2 additions & 0 deletions templates/al2023/template.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"launch_block_device_mappings_volume_size": null,
"nodeadm_build_image": null,
"nvidia_driver_major_version": null,
"nvidia_repository_url": null,
"pause_container_image": null,
"remote_folder": null,
"runc_version": null,
Expand Down Expand Up @@ -261,6 +262,7 @@
"BINARY_BUCKET_NAME={{user `binary_bucket_name`}}",
"BINARY_BUCKET_REGION={{user `binary_bucket_region`}}",
"NVIDIA_DRIVER_MAJOR_VERSION={{user `nvidia_driver_major_version`}}",
"NVIDIA_REPOSITORY={{user `nvidia_repository_url`}}",
"WORKING_DIR={{user `working_dir`}}"
]
},
Expand Down
1 change: 1 addition & 0 deletions templates/al2023/variables-default.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"launch_block_device_mappings_volume_size": "20",
"nodeadm_build_image": "public.ecr.aws/eks-distro-build-tooling/golang:1.23",
"nvidia_driver_major_version": "560",
"nvidia_repository_url": null,
"pause_container_image": "602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/pause:3.10",
"remote_folder": "/tmp",
"runc_version": "*",
Expand Down

0 comments on commit 2ffb0d5

Please sign in to comment.