feat(nvidia): build pytorch to get older cuda compute capabilities and setup arm64 support #578
```diff
@@ -2,9 +2,9 @@ package eksapi
 
 import (
 	"bytes"
+	"fmt"
 	"os"
 	"text/template"
-	"fmt"
 
 	"k8s.io/klog"
 )
```

---

```diff
@@ -16,6 +16,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
     LANG=C.UTF-8 \
     LC_ALL=C.UTF-8
 
+ARG PYTORCH_BRANCH=v2.5.0
+ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"
+
 ###############################################################################
 # 1) System packages
 ###############################################################################
@@ -75,3 +78,20 @@ WORKDIR /app
 COPY infer.py /app/
 COPY requirements.txt /app/
 RUN pip install --no-cache-dir -r requirements.txt
+
+###############################################################################
+# 4) Install Pytorch from Source
+###############################################################################
+# envs needed to make the path of NVCC known to the compilation
+ENV CUDA_HOME=/usr/local/cuda
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
+ENV PATH=$PATH:$CUDA_HOME/bin
+# this list could be minimized based on the supported GPUs
+ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.7 8.9 9.0"
+
+RUN pip3 install typing-extensions sympy
+RUN git clone \
+    --recursive https://github.com/pytorch/pytorch.git \
+    --branch $PYTORCH_BRANCH \
+    && cd pytorch && eval "$PYTORCH_BUILD_ENV python3 setup.py install" && cd .. \
+    && rm -rf pytorch
```
Comment on lines +82 to +97:

> Any idea how long this step takes? Just curious.

> The workflow took ~5hr 30mins, the bulk of which is this step 😅
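Since the point of the `ARG`/`ENV` additions is to compile kernels for the listed compute capabilities, a quick sanity check of the resulting image can help. Here is a minimal sketch (not part of this PR) that could be run inside the built container on a GPU instance; note that `torch.cuda.get_arch_list()` returns an empty list when no GPU is visible:

```python
import torch

# Architectures this torch build was compiled for, e.g. ['sm_75', 'sm_80', ...].
# Should mirror TORCH_CUDA_ARCH_LIST; returns [] if CUDA is unavailable.
print("compiled arch list:", torch.cuda.get_arch_list())

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"device 0 compute capability: {major}.{minor}")
    # End-to-end check that kernels actually exist for this device.
    x = torch.randn(4, 4, device="cuda")
    print("matmul ok:", (x @ x).sum().item())
```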

---

```diff
@@ -1,3 +1,2 @@
-torch==2.5
 transformers==4.33
 numpy==1.26
```

---

```diff
@@ -1,3 +1,2 @@
-torch==2.3
-transformers==4.29
-numpy==1.23
+transformers==4.33
+numpy==1.26
```

---

```diff
@@ -110,8 +110,7 @@ def main():
     # Retrieve environment variables
     rank = int(os.getenv("OMPI_COMM_WORLD_RANK", "0"))
     world_size = int(os.getenv("OMPI_COMM_WORLD_SIZE", "1"))
-    num_gpus_per_node = int(os.getenv("NUM_GPUS_PER_NODE", "8"))
-    local_rank = rank % num_gpus_per_node
+    local_rank = int(os.getenv("OMPI_COMM_WORLD_LOCAL_RANK", "0"))
 
     print(f"Process started for rank {rank} with local rank {local_rank}")
```

Comment on the `local_rank` change:

> @mattcjo any reason we didn't do this before, based on https://docs.open-mpi.org/en/v5.0.x/tuning-apps/environment-var.html?
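A short note on the change above: `OMPI_COMM_WORLD_LOCAL_RANK` is the per-node rank Open MPI exports to each launched process, so it can be used directly to pick a GPU without knowing the node's GPU count. A minimal sketch of the usual pattern, assuming `MASTER_ADDR`/`MASTER_PORT` are provided by the launcher (illustrative only, not the repository's actual `infer.py`):

```python
import os

import torch
import torch.distributed as dist


def init_distributed():
    # Ranks exported by Open MPI for every launched process.
    rank = int(os.getenv("OMPI_COMM_WORLD_RANK", "0"))
    world_size = int(os.getenv("OMPI_COMM_WORLD_SIZE", "1"))
    # Per-node rank, 0..(procs per node - 1); no NUM_GPUS_PER_NODE needed.
    local_rank = int(os.getenv("OMPI_COMM_WORLD_LOCAL_RANK", "0"))

    # Pin this process to one GPU before any CUDA work happens.
    torch.cuda.set_device(local_rank)

    # env:// rendezvous assumes MASTER_ADDR / MASTER_PORT are set by the launcher.
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    return rank, world_size, local_rank
```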
Comment on the `MAX_JOBS` build argument:

> Just curious... Any reason for choosing a value of 8 for `MAX_JOBS`?

> Yeah, this was just manually tuned: higher values of `MAX_JOBS` hit OOM errors during the build, so I searched for a value that still finished under the 6-hour default GitHub Actions limit.
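If hard-coding `MAX_JOBS=8` ever becomes a problem on different builders, one option is to derive it from the machine's resources at build time. A rough sketch of such a heuristic (the ~2 GB-per-compile-job figure is an assumption rather than a measured value, and the script name is hypothetical):

```python
import os


def suggest_max_jobs(mem_per_job_gb: float = 2.0) -> int:
    """Cap parallel compile jobs by CPU count and available RAM (Linux only)."""
    cpus = os.cpu_count() or 1
    # /proc/meminfo reports values in kB, e.g. "MemAvailable:  12345678 kB".
    with open("/proc/meminfo") as f:
        meminfo = {line.split(":")[0]: int(line.split()[1]) for line in f}
    avail_gb = meminfo["MemAvailable"] / (1024 * 1024)
    return max(1, min(cpus, int(avail_gb // mem_per_job_gb)))


if __name__ == "__main__":
    # e.g. in the Dockerfile: MAX_JOBS=$(python3 suggest_max_jobs.py)
    print(suggest_max_jobs())
```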