feat(nvidia): build pytorch to get older cuda compute capabilities and setup arm64 support
aws · Feb 5, 2025 · dfda945 · dfda945
1 parent b5a9e87
commit dfda945
Show file tree

Hide file tree

Showing 6 changed files with 48 additions and 11 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -35,12 +35,16 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - run: docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training
+    - run: |
+        docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training \
+          --build-arg PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0 USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 USE_DISTRIBUTED=0"
   build-image-nvidia-inference:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - run: docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference
+    - run: |
+        docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference \
+          --build-arg PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0 USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 USE_DISTRIBUTED=0"
   build-image-neuron-training:
     runs-on: ubuntu-latest
     steps:

diff --git a/test/cases/nvidia/manifests/job-unit-test-single-node.yaml b/test/cases/nvidia/manifests/job-unit-test-single-node.yaml
@@ -29,4 +29,4 @@ spec:
             cpu: "1"
             memory: 1Gi
       restartPolicy: Never
-  backoffLimit: 4
+  backoffLimit: 4
diff --git a/test/images/nvidia-inference/Dockerfile b/test/images/nvidia-inference/Dockerfile
@@ -16,6 +16,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
     LANG=C.UTF-8 \
     LC_ALL=C.UTF-8
 
+ARG PYTORCH_BRANCH=v2.5.0
+ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"
+
 ###############################################################################
 # 1) System packages
 ###############################################################################
@@ -75,3 +78,20 @@ WORKDIR /app
 COPY infer.py /app/
 COPY requirements.txt /app/
 RUN pip install --no-cache-dir -r requirements.txt
+
+###############################################################################
+# 4) Install Pytorch from Source
+###############################################################################
+# envs needed to make the path of NVCC known to the compilation
+ENV CUDA_HOME=/usr/local/cuda
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
+ENV PATH=$PATH:$CUDA_HOME/bin
+# this list could be minimized based on the supported GPUs
+ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.7 8.9 9.0"
+
+RUN pip3 install typing-extensions sympy
+RUN git clone \
+      --recursive https://github.com/pytorch/pytorch.git \
+      --branch $PYTORCH_BRANCH \
+ && cd pytorch && eval "$PYTORCH_BUILD_ENV python3 setup.py install" && cd .. \
+ && rm -rf pytorch
diff --git a/test/images/nvidia-inference/requirements.txt b/test/images/nvidia-inference/requirements.txt
@@ -1,3 +1,2 @@
-torch==2.5
 transformers==4.33
 numpy==1.26
diff --git a/test/images/nvidia-training/Dockerfile b/test/images/nvidia-training/Dockerfile
@@ -12,7 +12,9 @@ ENV NUM_GPUS_PER_NODE=8
 # Python dependency version numbers
 ARG PYTHON=python3.10
 ARG PYTHON_VERSION=3.10.12
-ARG PIP=pip3
+
+ARG PYTORCH_BRANCH=v2.3.0
+ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"
 
 RUN apt-get update \
  && apt-get upgrade -y \
@@ -58,10 +60,23 @@ RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VER
  && cd .. && rm -rf ../Python-$PYTHON_VERSION* \
  && ln -s /usr/local/bin/pip3 /usr/bin/pip \
  && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
- && ${PIP} --no-cache-dir install --upgrade \
+ && pip --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Install Pytorch from Source
+ENV CUDA_HOME=/usr/local/cuda
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
+ENV PATH=$PATH:$CUDA_HOME/bin
+ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.7 8.9 9.0"
+
+RUN pip install typing-extensions sympy pyyaml
+RUN git clone \
+      --recursive https://github.com/pytorch/pytorch.git \
+      --branch $PYTORCH_BRANCH \
+ && cd pytorch && eval "$PYTORCH_BUILD_ENV python3 setup.py install" && cd .. \
+ && rm -rf pytorch
+
 # Set the working directory in the container
 WORKDIR /app
 
@@ -74,7 +89,7 @@ RUN python -m pip install --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
 
 ARG EFA_INSTALLER_VERSION=latest
-ARG AWS_OFI_NCCL_VERSION=1.9.1
+ARG AWS_OFI_NCCL_VERSION=1.10.0-aws
 ARG NCCL_TESTS_VERSION=master
 
 RUN apt-get update -y && \
@@ -94,7 +109,7 @@ RUN mkdir -p /var/run/sshd && \
     sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
 
 # Set environment variables for OpenMPI, CUDA, EFA, and NCCL
-ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:/usr/lib/aarch64-linux-gnu/:$LD_LIBRARY_PATH
 ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
 
 # Install EFA
@@ -107,7 +122,7 @@ RUN cd $HOME \
 
 # Install NCCL (version specified)
 RUN apt-key del 7fa2af80 && \
-    curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
+    curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/$(uname -m)/cuda-keyring_1.0-1_all.deb && \
     dpkg -i cuda-keyring_1.0-1_all.deb && \
     sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2
 

diff --git a/test/images/nvidia-training/requirements.txt b/test/images/nvidia-training/requirements.txt
@@ -1,3 +1,2 @@
-torch==2.3
 transformers==4.29
-numpy==1.23
+numpy==1.23