From 817aab3bb3de9321300c70e4f26bbc5850383915 Mon Sep 17 00:00:00 2001 From: borg323 Date: Thu, 4 Apr 2024 01:03:43 +0300 Subject: [PATCH 1/4] move to cuda 11.8 with install script --- appveyor.yml | 8 +++---- dist/README-cuda.txt | 17 +++++++------ dist/README-cudnn.txt | 38 +++++++++++++++++++++++++++++ dist/install-cuda_11_8.cmd | 41 ++++++++++++++++++++++++++++++++ dist/install-dml.cmd | 9 +------ meson.build | 9 +++---- scripts/appveyor_win_package.cmd | 8 ++++--- 7 files changed, 104 insertions(+), 26 deletions(-) create mode 100644 dist/README-cudnn.txt create mode 100644 dist/install-cuda_11_8.cmd diff --git a/appveyor.yml b/appveyor.yml index dc0445dac8..7df0046a84 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -71,10 +71,10 @@ install: - cmd: IF DEFINED CUDNN_INSTALL cuda_10.0.130_win10_network -s nvcc_10.0 cublas_dev_10.0 cublas_10.0 cudart_10.0 - cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile http://developer.download.nvidia.com/compute/redist/cudnn/v7.4.2/cudnn-10.0-windows10-x64-v7.4.2.24.zip - cmd: IF DEFINED CUDNN_INSTALL 7z x cudnn-10.0-windows10-x64-v7.4.2.24.zip -o"%CUDA_PATH%" -- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1" +- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8" - cmd: IF %CUDA%==true IF NOT EXIST "%CUDA_PATH%" set CUDA_INSTALL=1 -- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe -- cmd: IF DEFINED CUDA_INSTALL cuda_11.1.0_win10_network.exe -s nvcc_11.1 cublas_dev_11.1 cublas_11.1 cudart_11.1 documentation_11.1 +- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe +- cmd: IF DEFINED CUDA_INSTALL cuda_11.8.0_windows_network.exe -s nvcc_11.8 cublas_dev_11.8 cublas_11.8 cudart_11.8 documentation_11.8 - cmd: IF %CUDA%==true set PATH=%CUDA_PATH%\bin;%PATH% - cmd: set PATH=C:\Python36;C:\Python36\scripts;%PATH% - cmd: pip3 install --upgrade meson==0.55.3 @@ -104,7 +104,7 @@ install: cache: - C:\cache - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0' - - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1' + - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8' - C:\projects\lc0\subprojects\packagecache - C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 before_build: diff --git a/dist/README-cuda.txt b/dist/README-cuda.txt index 8278cc53cb..4f35003cda 100644 --- a/dist/README-cuda.txt +++ b/dist/README-cuda.txt @@ -4,13 +4,16 @@ Lc0 is a UCI-compliant chess engine designed to play chess via neural network, specifically those of the LeelaChessZero project (https://lczero.org). -This binary uses CUDA and cuDNN dynamic link libraries copyrighted -by Nvidia corporation (http://www.nvidia.com), and redistributed as -permitted by the respective license file (see CUDA.txt section 2.2 -and CUDNN.txt section "CUDNN DISTRIBUTION" for details). You are -authorized to redistribute these libraries together with this -package as a whole but not individually. - +This binary uses CUDA dynamic link libraries copyrighted by Nvidia +corporation (http://www.nvidia.com), that can be redistributed as +permitted by the respective license file (see CUDA.txt section 2.2). +For size reasons you will have to get the required files by running +the included `install.cmd` script. 
If this fails you can get them by
+downloading
+<https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.8.89-archive.zip>
+and
+<https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.11.3.6-archive.zip>,
+the required dynamic link libraries are in the respective `bin`
+directories. You are authorized to redistribute these libraries
+together with this package as a whole but not individually.
 
 
 License
diff --git a/dist/README-cudnn.txt b/dist/README-cudnn.txt
new file mode 100644
index 0000000000..8278cc53cb
--- /dev/null
+++ b/dist/README-cudnn.txt
@@ -0,0 +1,38 @@
+Lc0
+
+Lc0 is a UCI-compliant chess engine designed to play chess via
+neural network, specifically those of the LeelaChessZero project
+(https://lczero.org).
+
+This binary uses CUDA and cuDNN dynamic link libraries copyrighted
+by Nvidia corporation (http://www.nvidia.com), and redistributed as
+permitted by the respective license file (see CUDA.txt section 2.2
+and CUDNN.txt section "CUDNN DISTRIBUTION" for details). You are
+authorized to redistribute these libraries together with this
+package as a whole but not individually.
+
+
+License
+
+Leela Chess is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Leela Chess is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+modified version of those libraries), containing parts covered by the
+terms of the respective license agreement, the licensors of this
+Program grant you additional permission to convey the resulting work.
+
diff --git a/dist/install-cuda_11_8.cmd b/dist/install-cuda_11_8.cmd
new file mode 100644
index 0000000000..89a6f2007e
--- /dev/null
+++ b/dist/install-cuda_11_8.cmd
@@ -0,0 +1,41 @@
+@echo off
+where /q tar
+if errorlevel 1 goto error
+
+cd /d %~dp0
+
+cls
+echo Installing the CUDA dlls required by the Lc0 cuda backend.
+
+echo 1/4. Downloading cudart.
+curl -# --ssl-no-revoke -o tmp_cudart.zip "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.8.89-archive.zip"
+if errorlevel 1 goto error
+
+echo 2/4. Extracting files.
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-11.8.89-archive/bin/cudart64_110.dll >cudart64_110.dll
+if errorlevel 1 goto error
+
+del /q tmp_cudart.zip
+
+echo 3/4. Downloading cublas.
+curl -# --ssl-no-revoke -o tmp_cublas.zip "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.11.3.6-archive.zip"
+if errorlevel 1 goto error
+
+echo 4/4. Extracting files.
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-11.11.3.6-archive/bin/cublas64_11.dll >cublas64_11.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-11.11.3.6-archive/bin/cublasLt64_11.dll >cublasLt64_11.dll
+if errorlevel 1 goto error
+
+del /q tmp_cublas.zip
+
+echo Installation successful.
+pause
+exit /b
+
+:error
+cls
+echo Installation failed - see the README for an alternative approach.
+pause + diff --git a/dist/install-dml.cmd b/dist/install-dml.cmd index 099f42958c..ca93411a55 100644 --- a/dist/install-dml.cmd +++ b/dist/install-dml.cmd @@ -2,14 +2,7 @@ where /q tar if errorlevel 1 goto error -where /q lc0.exe -if errorlevel 1 cd /d %~dp0 -where /q lc0.exe -if errorlevel 1 ( - echo This script must run in the lc0 folder. - pause - exit /b -) +cd /d %~dp0 cls echo Installing the DirectML.dll version required by the Lc0 onnx-dml backend. diff --git a/meson.build b/meson.build index ef0e0afcdf..85fcf296f9 100644 --- a/meson.build +++ b/meson.build @@ -509,7 +509,10 @@ if get_option('build_backends') # Handling of fp16 cuda code: If nvcc_extra_args is empty add options to # generate code for the major fp16 capable architectures. - if nvcc_extra_args == [] + if nvcc_extra_args == [] and nvcc_help.contains('-arch=all-major') + nvcc_extra_args = ['-arch=all-major'] + elif nvcc_extra_args == [] + # Fallback for cuda versions < 11.5 that don't support -arch=all-major. nvcc_arch = '-arch=compute_70' nvcc_sm_list = ['sm_70', 'sm_75', 'sm_80', 'sm_90'] if host_machine.system() != 'windows' @@ -528,9 +531,7 @@ if get_option('build_backends') endif endforeach # For forward compatibility. - if nvcc_help.contains('sm_90') # Cuda 12+ - nvcc_extra_args += '-gencode=arch=compute_90,code=compute_90' - elif nvcc_help.contains('sm_80') # Cuda 11+ + if nvcc_help.contains('sm_80') # Cuda 11+ nvcc_extra_args += '-gencode=arch=compute_80,code=compute_80' elif nvcc_help.contains('sm_75') # Cuda 10+ nvcc_extra_args += '-gencode=arch=compute_75,code=compute_75' diff --git a/scripts/appveyor_win_package.cmd b/scripts/appveyor_win_package.cmd index 36f98d8eef..6899197276 100644 --- a/scripts/appveyor_win_package.cmd +++ b/scripts/appveyor_win_package.cmd @@ -10,14 +10,13 @@ type "%MIMALLOC_PATH%"\LICENSE |more /P > dist\mimalloc-LICENSE 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%MIMALLOC_PATH%"\out\msvc-x64\Release\mimalloc-redirect.dll 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\mimalloc-readme.md 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\mimalloc-LICENSE -IF %CUDA%==true copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip +IF %CUDNN%==true copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip IF %NAME%==cpu-openblas 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\OpenBLAS\dist64\bin\libopenblas.dll IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\%DNNL_NAME%\bin\dnnl.dll IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\%DNNL_NAME%\bin\dnnl.dll IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\opencl-nug.0.777.77\build\native\bin\OpenCL.dll IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_100.dll" "%CUDA_PATH%\bin\cublas64_100.dll" IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\cuda\bin\cudnn64_7.dll" -IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_110.dll" "%CUDA_PATH%\bin\cublas64_11.dll" "%CUDA_PATH%\bin\cublasLt64_11.dll" IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE @@ -39,8 +38,11 
@@ IF %OPENCL%==true type scripts\check_opencl.bat |more /P > dist\check_opencl.bat IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_opencl.bat IF %DX%==true type scripts\check_dx.bat |more /P > dist\check_dx.bat IF %DX%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_dx.bat +IF %CUDA%==true IF %CUDNN%==false type dist\install-cuda_11_8.cmd |more /P > dist\install.cmd +IF %CUDA%==true IF %CUDNN%==false 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\install.cmd IF %CUDA%==true copy "%CUDA_PATH%\EULA.txt" dist\CUDA.txt -IF %CUDA%==true type dist\README-cuda.txt |more /P > dist\README.txt +IF %CUDA%==true IF %CUDNN%==false type dist\README-cuda.txt |more /P > dist\README.txt +IF %CUDNN%==true type dist\README-cudnn.txt |more /P > dist\README.txt IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\README.txt .\dist\CUDA.txt IF %CUDNN%==true copy "%CUDA_PATH%\cuda\NVIDIA_SLA_cuDNN_Support.txt" dist\CUDNN.txt IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\CUDNN.txt From 75a95ee4e1842af0e6846e2c207e2ab7cbb59d48 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 12 Apr 2024 14:38:48 +0300 Subject: [PATCH 2/4] use -arch=all instead of -arch=all-major --- meson.build | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 85fcf296f9..ed19b5a1d8 100644 --- a/meson.build +++ b/meson.build @@ -509,10 +509,10 @@ if get_option('build_backends') # Handling of fp16 cuda code: If nvcc_extra_args is empty add options to # generate code for the major fp16 capable architectures. - if nvcc_extra_args == [] and nvcc_help.contains('-arch=all-major') - nvcc_extra_args = ['-arch=all-major'] + if nvcc_extra_args == [] and nvcc_help.contains('-arch=all') + nvcc_extra_args = ['-arch=all'] elif nvcc_extra_args == [] - # Fallback for cuda versions < 11.5 that don't support -arch=all-major. + # Fallback for cuda versions < 11.5 that don't support -arch=all. nvcc_arch = '-arch=compute_70' nvcc_sm_list = ['sm_70', 'sm_75', 'sm_80', 'sm_90'] if host_machine.system() != 'windows' From 09ec4f47dccfac8799ce95c8254247ca2e939851 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 12 Apr 2024 17:02:49 +0300 Subject: [PATCH 3/4] back to all-major with extra code generation for CC 8.9 --- meson.build | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index ed19b5a1d8..cba22eecc0 100644 --- a/meson.build +++ b/meson.build @@ -509,10 +509,14 @@ if get_option('build_backends') # Handling of fp16 cuda code: If nvcc_extra_args is empty add options to # generate code for the major fp16 capable architectures. - if nvcc_extra_args == [] and nvcc_help.contains('-arch=all') - nvcc_extra_args = ['-arch=all'] + if nvcc_extra_args == [] and nvcc_help.contains('-arch=all-major') + nvcc_extra_args = ['-arch=all-major'] + # For rtx40x0 compatibility, some drivers seem to need this. + if nvcc_help.contains('sm_89') + nvcc_extra_args += '-gencode=arch=compute_89,code=sm_89' + endif elif nvcc_extra_args == [] - # Fallback for cuda versions < 11.5 that don't support -arch=all. + # Fallback for cuda versions < 11.5 that don't support -arch=all-major. 
nvcc_arch = '-arch=compute_70' nvcc_sm_list = ['sm_70', 'sm_75', 'sm_80', 'sm_90'] if host_machine.system() != 'windows' From 6ff478eac8dbab71075cf02264befafea426f113 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 12 Apr 2024 18:30:50 +0300 Subject: [PATCH 4/4] use all-major for fp32 code as well --- meson.build | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/meson.build b/meson.build index cba22eecc0..e6693d9f81 100644 --- a/meson.build +++ b/meson.build @@ -490,6 +490,8 @@ if get_option('build_backends') nvcc_extra_args = ['-arch=compute_' + cuda_cc, '-code=sm_' + cuda_cc] elif get_option('native_cuda') and nvcc_help.contains('-arch=native') nvcc_extra_args = ['-arch=native'] + elif nvcc_help.contains('-arch=all-major') + nvcc_extra_args = ['-arch=all-major'] endif foreach x : get_option('cudnn_include') cuda_arguments += ['-I', x] @@ -509,13 +511,7 @@ if get_option('build_backends') # Handling of fp16 cuda code: If nvcc_extra_args is empty add options to # generate code for the major fp16 capable architectures. - if nvcc_extra_args == [] and nvcc_help.contains('-arch=all-major') - nvcc_extra_args = ['-arch=all-major'] - # For rtx40x0 compatibility, some drivers seem to need this. - if nvcc_help.contains('sm_89') - nvcc_extra_args += '-gencode=arch=compute_89,code=sm_89' - endif - elif nvcc_extra_args == [] + if nvcc_extra_args == [] # Fallback for cuda versions < 11.5 that don't support -arch=all-major. nvcc_arch = '-arch=compute_70' nvcc_sm_list = ['sm_70', 'sm_75', 'sm_80', 'sm_90']
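
A note on the nvcc feature detection used in these patches: meson.build
only opts into -arch=all-major (and, in patch 3, the extra
compute_89/sm_89 gencode) when the nvcc_help text, presumably captured
`nvcc --help` output, contains the corresponding strings. The batch
snippet below is illustrative only and not part of the patches; it
assumes nvcc (CUDA 11.x or later) is on PATH and runs roughly the same
string checks by hand against a local toolkit.

@echo off
rem Illustrative only, not part of the patch series above: roughly mirror
rem the nvcc_help string checks from meson.build against the local toolkit.
rem Assumes nvcc is on PATH.
nvcc --version
rem CUDA 11.5+ help text lists the all-major architecture shortcut:
nvcc --help | findstr all-major
rem Architectures the toolkit can target (sm_89 = Ada, sm_90 = Hopper):
nvcc --help | findstr "sm_89 sm_90"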