Skip to content

Commit

Permalink
feat(nvidia): build pytorch to get older cuda compute capabilities and setup arm64 support
Browse files Browse the repository at this point in the history
  • Loading branch information
ndbaker1 committed Feb 5, 2025
1 parent b5a9e87 commit dfda945
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 11 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,16 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training
- run: |
docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training \
--build-arg PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0 USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 USE_DISTRIBUTED=0"
build-image-nvidia-inference:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference
- run: |
docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference \
--build-arg PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0 USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 USE_DISTRIBUTED=0"
build-image-neuron-training:
runs-on: ubuntu-latest
steps:
Expand Down
2 changes: 1 addition & 1 deletion test/cases/nvidia/manifests/job-unit-test-single-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ spec:
cpu: "1"
memory: 1Gi
restartPolicy: Never
backoffLimit: 4
backoffLimit: 4
20 changes: 20 additions & 0 deletions test/images/nvidia-inference/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8

ARG PYTORCH_BRANCH=v2.5.0
ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"

###############################################################################
# 1) System packages
###############################################################################
Expand Down Expand Up @@ -75,3 +78,20 @@ WORKDIR /app
COPY infer.py /app/
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

###############################################################################
# 4) Install Pytorch from Source
###############################################################################
# Build PyTorch from a git checkout instead of installing a prebuilt wheel.
# The ref to build and extra build-time settings come from the
# PYTORCH_BRANCH and PYTORCH_BUILD_ENV build args declared earlier in this file.
#
# envs needed to make the path of NVCC known to the compilation
ENV CUDA_HOME=/usr/local/cuda
# append the CUDA and CUPTI library directories to the existing search path
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
ENV PATH=$PATH:$CUDA_HOME/bin
# CUDA compute capabilities to build for;
# this list could be minimized based on the supported GPUs
ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.7 8.9 9.0"

# Python packages installed ahead of the source build
# (presumably required by PyTorch's setup.py - TODO confirm)
RUN pip3 install typing-extensions sympy
# Clone the requested ref together with its submodules, build and install it,
# then remove the checkout in the same RUN so the source tree is not kept in
# the resulting layer. `eval` is used so the space-separated VAR=value pairs in
# PYTORCH_BUILD_ENV are parsed by the shell as environment assignments for the
# setup.py command rather than as a command name.
RUN git clone \
--recursive https://github.com/pytorch/pytorch.git \
--branch $PYTORCH_BRANCH \
&& cd pytorch && eval "$PYTORCH_BUILD_ENV python3 setup.py install" && cd .. \
&& rm -rf pytorch
1 change: 0 additions & 1 deletion test/images/nvidia-inference/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
torch==2.5
transformers==4.33
numpy==1.26
25 changes: 20 additions & 5 deletions test/images/nvidia-training/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ ENV NUM_GPUS_PER_NODE=8
# Python dependency version numbers
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3

ARG PYTORCH_BRANCH=v2.3.0
ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"

RUN apt-get update \
&& apt-get upgrade -y \
Expand Down Expand Up @@ -58,10 +60,23 @@ RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VER
&& cd .. && rm -rf ../Python-$PYTHON_VERSION* \
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
&& pip --no-cache-dir install --upgrade \
pip \
setuptools

# Install PyTorch from Source
# The ref to build and extra build-time settings come from the
# PYTORCH_BRANCH and PYTORCH_BUILD_ENV build args declared earlier in this file.
#
# Point the build at the CUDA toolkit and expose its binaries on PATH.
ENV CUDA_HOME=/usr/local/cuda
# Append the CUDA and CUPTI library directories to the existing search path.
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
ENV PATH=$PATH:$CUDA_HOME/bin
# CUDA compute capabilities to build for.
ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.7 8.9 9.0"

# Python packages installed ahead of the source build
# (presumably required by PyTorch's setup.py - TODO confirm)
RUN pip install typing-extensions sympy pyyaml
# Clone the requested ref together with its submodules, build and install it,
# then remove the checkout in the same RUN so the source tree is not kept in
# the resulting layer. `eval` is used so the space-separated VAR=value pairs in
# PYTORCH_BUILD_ENV are parsed by the shell as environment assignments for the
# setup.py command rather than as a command name.
RUN git clone \
--recursive https://github.com/pytorch/pytorch.git \
--branch $PYTORCH_BRANCH \
&& cd pytorch && eval "$PYTORCH_BUILD_ENV python3 setup.py install" && cd .. \
&& rm -rf pytorch

# Set the working directory in the container
WORKDIR /app

Expand All @@ -74,7 +89,7 @@ RUN python -m pip install --upgrade pip && \
pip install --no-cache-dir -r requirements.txt

ARG EFA_INSTALLER_VERSION=latest
ARG AWS_OFI_NCCL_VERSION=1.9.1
ARG AWS_OFI_NCCL_VERSION=1.10.0-aws
ARG NCCL_TESTS_VERSION=master

RUN apt-get update -y && \
Expand All @@ -94,7 +109,7 @@ RUN mkdir -p /var/run/sshd && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# Set environment variables for OpenMPI, CUDA, EFA, and NCCL
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:/usr/lib/aarch64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
Expand All @@ -107,7 +122,7 @@ RUN cd $HOME \

# Install NCCL (version specified)
RUN apt-key del 7fa2af80 && \
curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/$(uname -m)/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2

Expand Down
3 changes: 1 addition & 2 deletions test/images/nvidia-training/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
torch==2.3
transformers==4.29
numpy==1.23
numpy==1.23

0 comments on commit dfda945

Please sign in to comment.