From 817aab3bb3de9321300c70e4f26bbc5850383915 Mon Sep 17 00:00:00 2001 From: borg323 Date: Thu, 4 Apr 2024 01:03:43 +0300 Subject: [PATCH 1/4] move to cuda 11.8 with install script --- appveyor.yml | 8 +++---- dist/README-cuda.txt | 17 +++++++------ dist/README-cudnn.txt | 38 +++++++++++++++++++++++++++++ dist/install-cuda_11_8.cmd | 41 ++++++++++++++++++++++++++++++++ dist/install-dml.cmd | 9 +------ meson.build | 9 +++---- scripts/appveyor_win_package.cmd | 8 ++++--- 7 files changed, 104 insertions(+), 26 deletions(-) create mode 100644 dist/README-cudnn.txt create mode 100644 dist/install-cuda_11_8.cmd diff --git a/appveyor.yml b/appveyor.yml index dc0445dac8..7df0046a84 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -71,10 +71,10 @@ install: - cmd: IF DEFINED CUDNN_INSTALL cuda_10.0.130_win10_network -s nvcc_10.0 cublas_dev_10.0 cublas_10.0 cudart_10.0 - cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile http://developer.download.nvidia.com/compute/redist/cudnn/v7.4.2/cudnn-10.0-windows10-x64-v7.4.2.24.zip - cmd: IF DEFINED CUDNN_INSTALL 7z x cudnn-10.0-windows10-x64-v7.4.2.24.zip -o"%CUDA_PATH%" -- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1" +- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8" - cmd: IF %CUDA%==true IF NOT EXIST "%CUDA_PATH%" set CUDA_INSTALL=1 -- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe -- cmd: IF DEFINED CUDA_INSTALL cuda_11.1.0_win10_network.exe -s nvcc_11.1 cublas_dev_11.1 cublas_11.1 cudart_11.1 documentation_11.1 +- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe +- cmd: IF DEFINED CUDA_INSTALL cuda_11.8.0_windows_network.exe -s nvcc_11.8 cublas_dev_11.8 cublas_11.8 cudart_11.8 documentation_11.8 - cmd: IF %CUDA%==true set PATH=%CUDA_PATH%\bin;%PATH% - cmd: set PATH=C:\Python36;C:\Python36\scripts;%PATH% - cmd: pip3 install --upgrade meson==0.55.3 @@ -104,7 +104,7 @@ install: cache: - C:\cache - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0' - - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1' + - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8' - C:\projects\lc0\subprojects\packagecache - C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 before_build: diff --git a/dist/README-cuda.txt b/dist/README-cuda.txt index 8278cc53cb..4f35003cda 100644 --- a/dist/README-cuda.txt +++ b/dist/README-cuda.txt @@ -4,13 +4,16 @@ Lc0 is a UCI-compliant chess engine designed to play chess via neural network, specifically those of the LeelaChessZero project (https://lczero.org). -This binary uses CUDA and cuDNN dynamic link libraries copyrighted -by Nvidia corporation (http://www.nvidia.com), and redistributed as -permitted by the respective license file (see CUDA.txt section 2.2 -and CUDNN.txt section "CUDNN DISTRIBUTION" for details). You are -authorized to redistribute these libraries together with this -package as a whole but not individually. - +This binary uses CUDA dynamic link libraries copyrighted by Nvidia +corporation (http://www.nvidia.com), that can be redistributed as +permitted by the respective license file (see CUDA.txt section 2.2). +For size reasons you will have to get the required files by running +the included `install.cmd` script. 
If this fails you can get them by
+downloading
+<https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.8.89-archive.zip>
+and
+<https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.11.3.6-archive.zip>,
+the required dynamic link libraries are in the respective `bin`
+directories. You are authorized to redistribute these libraries
+together with this package as a whole but not individually.
 
 
 License
diff --git a/dist/README-cudnn.txt b/dist/README-cudnn.txt
new file mode 100644
index 0000000000..8278cc53cb
--- /dev/null
+++ b/dist/README-cudnn.txt
@@ -0,0 +1,38 @@
+Lc0
+
+Lc0 is a UCI-compliant chess engine designed to play chess via
+neural network, specifically those of the LeelaChessZero project
+(https://lczero.org).
+
+This binary uses CUDA and cuDNN dynamic link libraries copyrighted
+by Nvidia corporation (http://www.nvidia.com), and redistributed as
+permitted by the respective license file (see CUDA.txt section 2.2
+and CUDNN.txt section "CUDNN DISTRIBUTION" for details). You are
+authorized to redistribute these libraries together with this
+package as a whole but not individually.
+
+
+License
+
+Leela Chess is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Leela Chess is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+modified version of those libraries), containing parts covered by the
+terms of the respective license agreement, the licensors of this
+Program grant you additional permission to convey the resulting work.
+
diff --git a/dist/install-cuda_11_8.cmd b/dist/install-cuda_11_8.cmd
new file mode 100644
index 0000000000..89a6f2007e
--- /dev/null
+++ b/dist/install-cuda_11_8.cmd
@@ -0,0 +1,41 @@
+@echo off
+where /q tar
+if errorlevel 1 goto error
+
+cd /d %~dp0
+
+cls
+echo Installing the CUDA dlls required by the Lc0 cuda backend.
+
+echo 1/4. Downloading cudart.
+curl -# --ssl-no-revoke -o tmp_cudart.zip "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.8.89-archive.zip"
+if errorlevel 1 goto error
+
+echo 2/4. Extracting files.
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-11.8.89-archive/bin/cudart64_110.dll >cudart64_110.dll
+if errorlevel 1 goto error
+
+del /q tmp_cudart.zip
+
+echo 3/4. Downloading cublas.
+curl -# --ssl-no-revoke -o tmp_cublas.zip "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.11.3.6-archive.zip"
+if errorlevel 1 goto error
+
+echo 4/4. Extracting files.
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-11.11.3.6-archive/bin/cublas64_11.dll >cublas64_11.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-11.11.3.6-archive/bin/cublasLt64_11.dll >cublasLt64_11.dll
+if errorlevel 1 goto error
+
+del /q tmp_cublas.zip
+
+echo Installation successful.
+pause
+exit /b
+
+:error
+cls
+echo Installation failed - see the README for an alternative approach.
+pause + diff --git a/dist/install-dml.cmd b/dist/install-dml.cmd index 099f42958c..ca93411a55 100644 --- a/dist/install-dml.cmd +++ b/dist/install-dml.cmd @@ -2,14 +2,7 @@ where /q tar if errorlevel 1 goto error -where /q lc0.exe -if errorlevel 1 cd /d %~dp0 -where /q lc0.exe -if errorlevel 1 ( - echo This script must run in the lc0 folder. - pause - exit /b -) +cd /d %~dp0 cls echo Installing the DirectML.dll version required by the Lc0 onnx-dml backend. diff --git a/meson.build b/meson.build index ef0e0afcdf..85fcf296f9 100644 --- a/meson.build +++ b/meson.build @@ -509,7 +509,10 @@ if get_option('build_backends') # Handling of fp16 cuda code: If nvcc_extra_args is empty add options to # generate code for the major fp16 capable architectures. - if nvcc_extra_args == [] + if nvcc_extra_args == [] and nvcc_help.contains('-arch=all-major') + nvcc_extra_args = ['-arch=all-major'] + elif nvcc_extra_args == [] + # Fallback for cuda versions < 11.5 that don't support -arch=all-major. nvcc_arch = '-arch=compute_70' nvcc_sm_list = ['sm_70', 'sm_75', 'sm_80', 'sm_90'] if host_machine.system() != 'windows' @@ -528,9 +531,7 @@ if get_option('build_backends') endif endforeach # For forward compatibility. - if nvcc_help.contains('sm_90') # Cuda 12+ - nvcc_extra_args += '-gencode=arch=compute_90,code=compute_90' - elif nvcc_help.contains('sm_80') # Cuda 11+ + if nvcc_help.contains('sm_80') # Cuda 11+ nvcc_extra_args += '-gencode=arch=compute_80,code=compute_80' elif nvcc_help.contains('sm_75') # Cuda 10+ nvcc_extra_args += '-gencode=arch=compute_75,code=compute_75' diff --git a/scripts/appveyor_win_package.cmd b/scripts/appveyor_win_package.cmd index 36f98d8eef..6899197276 100644 --- a/scripts/appveyor_win_package.cmd +++ b/scripts/appveyor_win_package.cmd @@ -10,14 +10,13 @@ type "%MIMALLOC_PATH%"\LICENSE |more /P > dist\mimalloc-LICENSE 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%MIMALLOC_PATH%"\out\msvc-x64\Release\mimalloc-redirect.dll 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\mimalloc-readme.md 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\mimalloc-LICENSE -IF %CUDA%==true copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip +IF %CUDNN%==true copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip IF %NAME%==cpu-openblas 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\OpenBLAS\dist64\bin\libopenblas.dll IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\%DNNL_NAME%\bin\dnnl.dll IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\%DNNL_NAME%\bin\dnnl.dll IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\opencl-nug.0.777.77\build\native\bin\OpenCL.dll IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_100.dll" "%CUDA_PATH%\bin\cublas64_100.dll" IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\cuda\bin\cudnn64_7.dll" -IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_110.dll" "%CUDA_PATH%\bin\cublas64_11.dll" "%CUDA_PATH%\bin\cublasLt64_11.dll" IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE @@ -39,8 +38,11 
@@ IF %OPENCL%==true type scripts\check_opencl.bat |more /P > dist\check_opencl.bat IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_opencl.bat IF %DX%==true type scripts\check_dx.bat |more /P > dist\check_dx.bat IF %DX%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_dx.bat +IF %CUDA%==true IF %CUDNN%==false type dist\install-cuda_11_8.cmd |more /P > dist\install.cmd +IF %CUDA%==true IF %CUDNN%==false 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\install.cmd IF %CUDA%==true copy "%CUDA_PATH%\EULA.txt" dist\CUDA.txt -IF %CUDA%==true type dist\README-cuda.txt |more /P > dist\README.txt +IF %CUDA%==true IF %CUDNN%==false type dist\README-cuda.txt |more /P > dist\README.txt +IF %CUDNN%==true type dist\README-cudnn.txt |more /P > dist\README.txt IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\README.txt .\dist\CUDA.txt IF %CUDNN%==true copy "%CUDA_PATH%\cuda\NVIDIA_SLA_cuDNN_Support.txt" dist\CUDNN.txt IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\CUDNN.txt From 75a95ee4e1842af0e6846e2c207e2ab7cbb59d48 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 12 Apr 2024 14:38:48 +0300 Subject: [PATCH 2/4] use -arch=all instead of -arch=all-major --- meson.build | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 85fcf296f9..ed19b5a1d8 100644 --- a/meson.build +++ b/meson.build @@ -509,10 +509,10 @@ if get_option('build_backends') # Handling of fp16 cuda code: If nvcc_extra_args is empty add options to # generate code for the major fp16 capable architectures. - if nvcc_extra_args == [] and nvcc_help.contains('-arch=all-major') - nvcc_extra_args = ['-arch=all-major'] + if nvcc_extra_args == [] and nvcc_help.contains('-arch=all') + nvcc_extra_args = ['-arch=all'] elif nvcc_extra_args == [] - # Fallback for cuda versions < 11.5 that don't support -arch=all-major. + # Fallback for cuda versions < 11.5 that don't support -arch=all. nvcc_arch = '-arch=compute_70' nvcc_sm_list = ['sm_70', 'sm_75', 'sm_80', 'sm_90'] if host_machine.system() != 'windows' From 09ec4f47dccfac8799ce95c8254247ca2e939851 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 12 Apr 2024 17:02:49 +0300 Subject: [PATCH 3/4] back to all-major with extra code generation for CC 8.9 --- meson.build | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index ed19b5a1d8..cba22eecc0 100644 --- a/meson.build +++ b/meson.build @@ -509,10 +509,14 @@ if get_option('build_backends') # Handling of fp16 cuda code: If nvcc_extra_args is empty add options to # generate code for the major fp16 capable architectures. - if nvcc_extra_args == [] and nvcc_help.contains('-arch=all') - nvcc_extra_args = ['-arch=all'] + if nvcc_extra_args == [] and nvcc_help.contains('-arch=all-major') + nvcc_extra_args = ['-arch=all-major'] + # For rtx40x0 compatibility, some drivers seem to need this. + if nvcc_help.contains('sm_89') + nvcc_extra_args += '-gencode=arch=compute_89,code=sm_89' + endif elif nvcc_extra_args == [] - # Fallback for cuda versions < 11.5 that don't support -arch=all. + # Fallback for cuda versions < 11.5 that don't support -arch=all-major. 
nvcc_arch = '-arch=compute_70' nvcc_sm_list = ['sm_70', 'sm_75', 'sm_80', 'sm_90'] if host_machine.system() != 'windows' From 6ff478eac8dbab71075cf02264befafea426f113 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 12 Apr 2024 18:30:50 +0300 Subject: [PATCH 4/4] use all-major for fp32 code as well --- meson.build | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/meson.build b/meson.build index cba22eecc0..e6693d9f81 100644 --- a/meson.build +++ b/meson.build @@ -490,6 +490,8 @@ if get_option('build_backends') nvcc_extra_args = ['-arch=compute_' + cuda_cc, '-code=sm_' + cuda_cc] elif get_option('native_cuda') and nvcc_help.contains('-arch=native') nvcc_extra_args = ['-arch=native'] + elif nvcc_help.contains('-arch=all-major') + nvcc_extra_args = ['-arch=all-major'] endif foreach x : get_option('cudnn_include') cuda_arguments += ['-I', x] @@ -509,13 +511,7 @@ if get_option('build_backends') # Handling of fp16 cuda code: If nvcc_extra_args is empty add options to # generate code for the major fp16 capable architectures. - if nvcc_extra_args == [] and nvcc_help.contains('-arch=all-major') - nvcc_extra_args = ['-arch=all-major'] - # For rtx40x0 compatibility, some drivers seem to need this. - if nvcc_help.contains('sm_89') - nvcc_extra_args += '-gencode=arch=compute_89,code=sm_89' - endif - elif nvcc_extra_args == [] + if nvcc_extra_args == [] # Fallback for cuda versions < 11.5 that don't support -arch=all-major. nvcc_arch = '-arch=compute_70' nvcc_sm_list = ['sm_70', 'sm_75', 'sm_80', 'sm_90']
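
A note on the nvcc feature detection used in these patches: meson.build
only opts into -arch=all-major (and, in patch 3, the extra
compute_89/sm_89 gencode) when the nvcc_help text, presumably captured
`nvcc --help` output, contains the corresponding strings. The batch
snippet below is illustrative only and not part of the patches; it
assumes nvcc (CUDA 11.x or later) is on PATH and runs roughly the same
string checks by hand against a local toolkit.

@echo off
rem Illustrative only, not part of the patch series above: roughly mirror
rem the nvcc_help string checks from meson.build against the local toolkit.
rem Assumes nvcc is on PATH.
nvcc --version
rem CUDA 11.5+ help text lists the all-major architecture shortcut:
nvcc --help | findstr all-major
rem Architectures the toolkit can target (sm_89 = Ada, sm_90 = Hopper):
nvcc --help | findstr "sm_89 sm_90"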