feat(nvidia): build pytorch to get older cuda compute capabilities and setup arm64 support #578

Merged 1 commit on Feb 17, 2025
8 changes: 6 additions & 2 deletions .github/workflows/ci.yaml
@@ -35,12 +35,16 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training
- run: |
docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training \
--build-arg PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0 USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 USE_DISTRIBUTED=0"
build-image-nvidia-inference:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference
- run: |
docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference \
--build-arg PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0 USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 USE_DISTRIBUTED=0"
Contributor

Just curious... Any reason for choosing a value of 8 for MAX_JOBS?

Contributor Author

Yeah, this was just manually tuned: MAX_JOBS values that were too high hit OOM errors, so I searched for a value that passed in under the 6hr default gh-action limit.

build-image-neuron-training:
runs-on: ubuntu-latest
steps:
2 changes: 1 addition & 1 deletion internal/deployers/eksapi/kubeconfig.go
@@ -2,9 +2,9 @@ package eksapi

import (
"bytes"
"fmt"
"os"
"text/template"
"fmt"

"k8s.io/klog"
)
6 changes: 4 additions & 2 deletions test/cases/nvidia-training/bert_training_test.go
@@ -69,9 +69,11 @@ func TestBertTraining(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{Name: "bert-training-launcher", Namespace: "default"},
}
err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
wait.WithTimeout(time.Minute*20))
wait.WithTimeout(time.Minute*20),
wait.WithContext(ctx),
)
if err != nil {
t.Fatal(err)
t.Error(err)
}

err = printJobLogs(ctx, cfg, "default", "bert-training-launcher")
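The change above threads the test context into the wait and downgrades the failure to t.Error so the log-printing step below it still runs. A minimal standalone sketch of why passing a context into a polling wait matters; this is a generic helper for illustration, not the e2e-framework's actual wait implementation:

```go
// Sketch only: a context-aware poll loop, illustrating why the test passes
// wait.WithContext(ctx). Cancellation (e.g. Ctrl-C via signal.NotifyContext)
// aborts the wait early instead of burning the full 20-minute timeout.
package main

import (
	"context"
	"fmt"
	"time"
)

func pollUntil(ctx context.Context, interval, timeout time.Duration, cond func() (bool, error)) error {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		done, err := cond()
		if done || err != nil {
			return err
		}
		select {
		case <-ctx.Done():
			return ctx.Err() // canceled or timed out
		case <-ticker.C:
		}
	}
}

func main() {
	start := time.Now()
	err := pollUntil(context.Background(), time.Second, 20*time.Minute, func() (bool, error) {
		return time.Since(start) > 3*time.Second, nil // stand-in for "job succeeded"
	})
	fmt.Println("wait finished:", err)
}
```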
13 changes: 6 additions & 7 deletions test/cases/nvidia-training/main_test.go
@@ -8,6 +8,7 @@ import (
"fmt"
"log"
"os"
"os/signal"
"slices"
"testing"
"time"
@@ -37,7 +38,10 @@ func TestMain(m *testing.M) {
if err != nil {
log.Fatalf("failed to initialize test environment: %v", err)
}
testenv = env.NewWithConfig(cfg)

ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
defer cancel()
testenv = env.NewWithConfig(cfg).WithContext(ctx)

manifests := [][]byte{
nvidiaDevicePluginManifest,
@@ -147,16 +151,11 @@ func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Contex
return ctx, fmt.Errorf("no nodes found in the cluster")
}

singleNodeType := true
for i := 1; i < len(nodes.Items); i++ {
if nodes.Items[i].Labels["node.kubernetes.io/instance-type"] != nodes.Items[i-1].Labels["node.kubernetes.io/instance-type"] {
singleNodeType = false
break
return ctx, fmt.Errorf("node types are not the same, all node types must be the same in the cluster")
}
}
if !singleNodeType {
return ctx, fmt.Errorf("node types are not the same, all node types must be the same in the cluster")
}

if *nodeType != "" {
count := 0
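As shown above, TestMain now derives the test environment's context from signal.NotifyContext, so an interrupted run cancels in-flight waits and deferred cleanup still fires. A small sketch of the pattern using only the standard library; runSuite stands in for the real env.NewWithConfig(cfg).WithContext(ctx) wiring:

```go
// Sketch: binding a test run to an interrupt-aware context, as the PR does in
// TestMain. The suite body is simplified to a function that only respects ctx.
package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"time"
)

func runSuite(ctx context.Context) error {
	select {
	case <-time.After(2 * time.Second): // stand-in for the real test suite
		return nil
	case <-ctx.Done():
		return ctx.Err() // a Ctrl-C lands here instead of the run hanging for hours
	}
}

func main() {
	// The context is cancelled on the first SIGINT, which unwinds the waits
	// and lets deferred teardown run.
	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
	defer cancel()

	if err := runSuite(ctx); err != nil {
		fmt.Fprintln(os.Stderr, "suite aborted:", err)
		os.Exit(1)
	}
	fmt.Println("suite finished")
}
```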
12 changes: 5 additions & 7 deletions test/cases/nvidia/main_test.go
@@ -9,9 +9,9 @@ import (
"fmt"
"log"
"os"
"os/signal"
"slices"
"testing"
"time"

fwext "github.com/aws/aws-k8s-tester/internal/e2e"
"github.com/aws/aws-sdk-go-v2/aws"
@@ -31,6 +31,7 @@ var (
installDevicePlugin *bool
efaEnabled *bool
nvidiaTestImage *string
pytorchImage *string
skipUnitTestSubcommand *string
nodeCount int
gpuPerNode int
@@ -99,15 +100,11 @@ func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Contex
return ctx, err
}

singleNodeType := true
for i := 1; i < len(nodes.Items)-1; i++ {
if nodes.Items[i].Labels["node.kubernetes.io/instance-type"] != nodes.Items[i-1].Labels["node.kubernetes.io/instance-type"] {
singleNodeType = false
return ctx, fmt.Errorf("Node types are not the same, all node types must be the same in the cluster")
}
}
if !singleNodeType {
return ctx, fmt.Errorf("Node types are not the same, all node types must be the same in the cluster")
}

if *nodeType != "" {
for _, v := range nodes.Items {
@@ -135,6 +132,7 @@ func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Contex
func TestMain(m *testing.M) {
nodeType = flag.String("nodeType", "", "node type for the tests")
nvidiaTestImage = flag.String("nvidiaTestImage", "", "nccl test image for nccl tests")
pytorchImage = flag.String("pytorchImage", "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.1.0-gpu-py310-cu121-ubuntu20.04-ec2", "pytorch cuda image for single node tests")
efaEnabled = flag.Bool("efaEnabled", false, "enable efa tests")
installDevicePlugin = flag.Bool("installDevicePlugin", true, "install nvidia device plugin")
skipUnitTestSubcommand = flag.String("skipUnitTestSubcommand", "", "optional command to skip specified unit test, `-s test1|test2|...`")
@@ -143,7 +141,7 @@ func TestMain(m *testing.M) {
log.Fatalf("failed to initialize test environment: %v", err)
}
testenv = env.NewWithConfig(cfg)
ctx, cancel := context.WithTimeout(context.Background(), 55*time.Minute)
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
defer cancel()
testenv = testenv.WithContext(ctx)

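The simplified checkNodeTypes loop fails fast on the first mismatched node.kubernetes.io/instance-type label instead of tracking a singleNodeType flag. Roughly the same check over a plain slice of label values, as a hypothetical helper rather than the real corev1.Node objects:

```go
// Sketch: fail-fast uniformity check over instance-type labels, mirroring the
// simplified loop. instanceTypes stands in for the label values read from the
// cluster's nodes.
package main

import (
	"errors"
	"fmt"
)

func allSameInstanceType(instanceTypes []string) error {
	for i := 1; i < len(instanceTypes); i++ {
		if instanceTypes[i] != instanceTypes[i-1] {
			return errors.New("node types are not the same, all node types must be the same in the cluster")
		}
	}
	return nil
}

func main() {
	fmt.Println(allSameInstanceType([]string{"p3.8xlarge", "p3.8xlarge"})) // <nil>
	fmt.Println(allSameInstanceType([]string{"p3.8xlarge", "g5.2xlarge"})) // error
}
```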
6 changes: 6 additions & 0 deletions test/cases/nvidia/manifests/job-hpc-benchmarks.yaml
@@ -32,14 +32,20 @@ spec:
- HPL_FCT_COMM_POLICY=1
- -x
- HPL_USE_NVSHMEM=0
# TODO: for arm it will be
# - hpl-aarch64.sh
- hpl.sh
- --mem-affinity
- 0:0:0:0:1:1:1:1
# --cpu-affinity needs to be tuned depending on the number of CPUs
# available on the instance type.
- --cpu-affinity
- 0-13:14-27:28-41:42-55:56-69:70-83:84-97:98-111
- --no-multinode
- --dat
- hpl-linux-x86_64/sample-dat/HPL-dgx-1N.dat
# TODO: the path differs for arm64
# - hpl-linux-aarch64-gpu/sample-dat/HPL-dgx-1N.dat
volumeMounts:
- mountPath: /dev/shm
name: dshm
10 changes: 7 additions & 3 deletions test/cases/nvidia/manifests/job-unit-test-single-node.yaml
@@ -17,8 +17,12 @@ spec:
- /bin/bash
- ./gpu_unit_tests/unit_test
env:
- name: SKIP_TESTS_SUBCOMMAND
value: {{.SkipTestSubcommand}}
- name: SKIP_TESTS_SUBCOMMAND
value: {{.SkipTestSubcommand}}
# because we started building these from source, this is just a
# regular binary.
- name: DEMO_SUITE_DIR
value: /usr/bin
imagePullPolicy: Always
resources:
limits:
@@ -29,4 +33,4 @@ spec:
cpu: "1"
memory: 1Gi
restartPolicy: Never
backoffLimit: 4
backoffLimit: 4
@@ -16,7 +16,7 @@ spec:
spec:
restartPolicy: OnFailure
containers:
- image: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.1.0-gpu-py310-cu121-ubuntu20.04-ec2
- image: {{.PytorchTestImage}}
name: gpu-test
command:
- mpirun
@@ -48,7 +48,7 @@ spec:
- MXNET_CUDNN_AUTOTUNE_DEFAULT=0
- python
- -c
- import os; os.system("git clone https://github.com/pytorch/examples.git /pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python /pytorch-examples/mnist/main.py --epochs 1")
- import os; os.system("git clone https://github.com/pytorch/examples.git pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python pytorch-examples/mnist/main.py --epochs 1")
resources:
limits:
nvidia.com/gpu: 1
10 changes: 9 additions & 1 deletion test/cases/nvidia/mpi_test.go
@@ -48,7 +48,15 @@ func TestMPIJobPytorchTraining(t *testing.T) {
WithLabel("hardware", "gpu").
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
t.Log("Applying single node manifest")
err := fwext.ApplyManifests(cfg.Client().RESTConfig(), mpiJobPytorchTrainingSingleNodeManifest)
renderedSingleNodeManifest, err := fwext.RenderManifests(mpiJobPytorchTrainingSingleNodeManifest, struct {
PytorchTestImage string
}{
PytorchTestImage: *pytorchImage,
})
if err != nil {
t.Fatal(err)
}
err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedSingleNodeManifest)
if err != nil {
t.Fatal(err)
}
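RenderManifests is the repo's own helper; conceptually it runs the embedded YAML through Go's text/template so {{.PytorchTestImage}} picks up the -pytorchImage flag. A rough sketch of that substitution using only the standard library (the helper's real signature may differ):

```go
// Sketch: substituting an image into a YAML manifest with text/template,
// approximating what rendering {{.PytorchTestImage}} does in the test above.
package main

import (
	"bytes"
	"fmt"
	"log"
	"text/template"
)

const manifest = `containers:
  - image: {{.PytorchTestImage}}
    name: gpu-test
`

func render(tmpl string, data any) ([]byte, error) {
	t, err := template.New("manifest").Parse(tmpl)
	if err != nil {
		return nil, err
	}
	var buf bytes.Buffer
	if err := t.Execute(&buf, data); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}

func main() {
	out, err := render(manifest, struct{ PytorchTestImage string }{
		PytorchTestImage: "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.1.0-gpu-py310-cu121-ubuntu20.04-ec2",
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Print(string(out))
}
```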
8 changes: 4 additions & 4 deletions test/cases/nvidia/unit_test.go
@@ -75,13 +75,13 @@ func TestSingleNodeUnitTest(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"},
})
if err != nil {
t.Fatal(err)
t.Error(err)
}
t.Log("Test log for unit-test-job:")
t.Log(log)
err = fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedJobUnitTestSingleNodeManifest)
if err != nil {
t.Fatal(err)
t.Error(err)
}
return ctx
}).
@@ -120,13 +120,13 @@ func TestSingleNodeUnitTest(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{Name: "hpc-benckmarks-job", Namespace: "default"},
})
if err != nil {
t.Fatal(err)
t.Error(err)
}
t.Log("Test log for hpc-benckmarks-job:")
t.Log(log)
err = fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedJobHpcBenchmarksSingleNodeManifest)
if err != nil {
t.Fatal(err)
t.Error(err)
}
return ctx
}).
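Switching t.Fatal to t.Error in these steps means a failed log fetch no longer aborts the closure before the DeleteManifests cleanup call runs. A tiny illustration of the difference with the standard testing package, simplified from the e2e-framework Assess closures used above:

```go
// Sketch: t.Fatal stops the test function immediately, so the cleanup call
// below it would never run; t.Error records the failure and continues.
package example

import "testing"

func TestCleanupStillRuns(t *testing.T) {
	if err := doWork(); err != nil {
		t.Error(err) // with t.Fatal(err), cleanup() below would be skipped
	}
	cleanup(t)
}

func doWork() error        { return nil }
func cleanup(t *testing.T) { t.Log("cleanup ran") }
```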
20 changes: 20 additions & 0 deletions test/images/nvidia-inference/Dockerfile
@@ -16,6 +16,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8

ARG PYTORCH_BRANCH=v2.5.0
ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"

###############################################################################
# 1) System packages
###############################################################################
@@ -75,3 +78,20 @@ WORKDIR /app
COPY infer.py /app/
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

###############################################################################
# 4) Install Pytorch from Source
###############################################################################
# envs needed to make the path of NVCC known to the compilation
ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
ENV PATH=$PATH:$CUDA_HOME/bin
# this list could be minimized based on the supported GPUs
ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.7 8.9 9.0"

RUN pip3 install typing-extensions sympy
RUN git clone \
--recursive https://github.com/pytorch/pytorch.git \
--branch $PYTORCH_BRANCH \
&& cd pytorch && eval "$PYTORCH_BUILD_ENV python3 setup.py install" && cd .. \
&& rm -rf pytorch
Comment on lines +82 to +97
Contributor

Any idea how long this step takes? Just curious

Contributor Author

the workflow took ~5hr 30min, and the bulk of that is spent in this step 😅

1 change: 0 additions & 1 deletion test/images/nvidia-inference/requirements.txt
@@ -1,3 +1,2 @@
torch==2.5
transformers==4.33
numpy==1.26
26 changes: 20 additions & 6 deletions test/images/nvidia-training/Dockerfile
@@ -7,12 +7,13 @@ ENV DEBIAN_FRONTEND=noninteractive
# Set default values for MASTER_ADDR, MASTER_PORT, and NUM_GPUS_PER_NODE
ENV MASTER_ADDR=127.0.0.1
ENV MASTER_PORT=12355
ENV NUM_GPUS_PER_NODE=8

# Python dependency version numbers
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3

ARG PYTORCH_BRANCH=v2.3.0
ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"

RUN apt-get update \
&& apt-get upgrade -y \
@@ -58,10 +59,23 @@ RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VER
&& cd .. && rm -rf ../Python-$PYTHON_VERSION* \
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
&& pip --no-cache-dir install --upgrade \
pip \
setuptools

# Install Pytorch from Source
ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
ENV PATH=$PATH:$CUDA_HOME/bin
ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.7 8.9 9.0"

RUN pip install typing-extensions sympy pyyaml
RUN git clone \
--recursive https://github.com/pytorch/pytorch.git \
--branch $PYTORCH_BRANCH \
&& cd pytorch && eval "$PYTORCH_BUILD_ENV python3 setup.py install" && cd .. \
&& rm -rf pytorch

# Set the working directory in the container
WORKDIR /app

@@ -74,7 +88,7 @@ RUN python -m pip install --upgrade pip && \
pip install --no-cache-dir -r requirements.txt

ARG EFA_INSTALLER_VERSION=latest
ARG AWS_OFI_NCCL_VERSION=1.9.1
ARG AWS_OFI_NCCL_VERSION=1.13.2
ARG NCCL_TESTS_VERSION=master

RUN apt-get update -y && \
@@ -94,7 +108,7 @@ RUN mkdir -p /var/run/sshd && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# Set environment variables for OpenMPI, CUDA, EFA, and NCCL
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:/usr/lib/aarch64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
@@ -107,7 +121,7 @@ RUN cd $HOME \

# Install NCCL (version specified)
RUN apt-key del 7fa2af80 && \
curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/$(uname -m | sed 's/aarch64/sbsa/')/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2

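The cuda-keyring URL above now derives the repo architecture from uname -m, mapping aarch64 to NVIDIA's sbsa (Server Base System Architecture) path for arm64 hosts. A small sketch of the same mapping, assuming only that the two repo path segments are x86_64 and sbsa:

```go
// Sketch: the arch mapping behind the cuda-keyring URL change; this mirrors
// the `uname -m | sed 's/aarch64/sbsa/'` pipeline in the Dockerfile.
package main

import (
	"fmt"
	"runtime"
)

func nvidiaRepoArch() string {
	switch runtime.GOARCH {
	case "arm64":
		return "sbsa" // NVIDIA's repo path for ARM servers
	default:
		return "x86_64"
	}
}

func main() {
	fmt.Printf("https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/%s/cuda-keyring_1.0-1_all.deb\n",
		nvidiaRepoArch())
}
```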
5 changes: 2 additions & 3 deletions test/images/nvidia-training/requirements.txt
@@ -1,3 +1,2 @@
torch==2.3
transformers==4.29
numpy==1.23
transformers==4.33
numpy==1.26
3 changes: 1 addition & 2 deletions test/images/nvidia-training/train.py
@@ -110,8 +110,7 @@ def main():
# Retrieve environment variables
rank = int(os.getenv("OMPI_COMM_WORLD_RANK", "0"))
world_size = int(os.getenv("OMPI_COMM_WORLD_SIZE", "1"))
num_gpus_per_node = int(os.getenv("NUM_GPUS_PER_NODE", "8"))
local_rank = rank % num_gpus_per_node
local_rank = int(os.getenv("OMPI_COMM_WORLD_LOCAL_RANK", "0"))

print(f"Process started for rank {rank} with local rank {local_rank}")
