From 507df5f9a039a7afe6e84c1eb024e01d38325962 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 17 Feb 2022 11:39:33 +0100 Subject: [PATCH 1/4] Reuse at-retry_reclaim from alloc. --- src/pool.jl | 167 ++++++++++++++++++++++------------------------------ 1 file changed, 71 insertions(+), 96 deletions(-) diff --git a/src/pool.jl b/src/pool.jl index c5d4ead420..378e98b37e 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -164,6 +164,73 @@ function Base.showerror(io::IO, err::OutOfGPUMemoryError) memory_status(io) end +""" + @retry_reclaim isfailed(ret) ex + +Run a block of code `ex` repeatedly until it successfully allocates the memory it needs. +Retries are only attempted when calling `isfailed` with the current return value is true. +At each try, more and more memory is freed from the CUDA memory pool. When that is not +possible anymore, the latest returned value will be returned. + +This macro is intended for use with CUDA APIs, which sometimes allocate (outside of the +CUDA memory pool) and return a specific error code when failing to. +""" +macro retry_reclaim(isfailed, ex) + quote + ret = $(esc(ex)) + + # slow path, incrementally reclaiming more memory until we succeed + if $(esc(isfailed))(ret) + state = active_state() + is_stream_ordered = stream_ordered(state.device) + + phase = 1 + while true + if is_stream_ordered + # NOTE: the stream-ordered allocator only releases memory on actual API calls, + # and not when our synchronization routines query the relevant streams. + # we do still call our routines to minimize the time we block in libcuda. 
+ if phase == 1 + synchronize(state.stream) + elseif phase == 2 + device_synchronize() + elseif phase == 3 + GC.gc(false) + device_synchronize() + elseif phase == 4 + GC.gc(true) + device_synchronize() + elseif phase == 5 + # in case we had a release threshold configured + trim(memory_pool(state.device)) + else + break + end + else + if phase == 1 + GC.gc(false) + elseif phase == 2 + GC.gc(true) + else + break + end + end + phase += 1 + + ret = $(esc(ex)) + $(esc(isfailed))(ret) || break + end + end + + ret + end +end + +# XXX: function version for use in CUDAdrv where we haven't loaded pool.jl yet +function retry_reclaim(f, check) + @retry_reclaim check f() +end + """ alloc([::BufferType], sz; [stream::CuStream]) @@ -192,36 +259,11 @@ end gctime = 0.0 time = Base.@elapsed begin - buf = nothing - if stream_ordered(state.device) - # mark the pool as active - pool_mark(state.device) - - for phase in 1:5 - if phase == 2 - gctime += Base.@elapsed GC.gc(false) - elseif phase == 3 - gctime += Base.@elapsed GC.gc(true) - elseif phase == 4 - synchronize(stream) - elseif phase == 5 - device_synchronize() - end - - buf = actual_alloc(sz; async=true, stream) - buf === nothing || break - end + buf = if stream_ordered(state.device) + pool_mark(state.device) # mark the pool as active + @retry_reclaim isnothing actual_alloc(sz; async=true, stream) else - for phase in 1:3 - if phase == 2 - gctime += Base.@elapsed GC.gc(false) - elseif phase == 3 - gctime += Base.@elapsed GC.gc(true) - end - - buf = actual_alloc(sz; async=false) - buf === nothing || break - end + @retry_reclaim isnothing actual_alloc(sz; async=false) end buf === nothing && throw(OutOfGPUMemoryError(sz)) end @@ -297,73 +339,6 @@ function reclaim(sz::Int=typemax(Int)) end end -""" - @retry_reclaim isfailed(ret) ex - -Run a block of code `ex` repeatedly until it successfully allocates the memory it needs. -Retries are only attempted when calling `isfailed` with the current return value is true. 
-At each try, more and more memory is freed from the CUDA memory pool. When that is not -possible anymore, the latest returned value will be returned. - -This macro is intended for use with CUDA APIs, which sometimes allocate (outside of the -CUDA memory pool) and return a specific error code when failing to. -""" -macro retry_reclaim(isfailed, ex) - quote - ret = $(esc(ex)) - - # slow path, incrementally reclaiming more memory until we succeed - if $(esc(isfailed))(ret) - state = active_state() - is_stream_ordered = stream_ordered(state.device) - - phase = 1 - while true - if is_stream_ordered - # NOTE: the stream-ordered allocator only releases memory on actual API calls, - # and not when our synchronization routines query the relevant streams. - # we do still call our routines to minimize the time we block in libcuda. - if phase == 1 - synchronize(state.stream) - elseif phase == 2 - device_synchronize() - elseif phase == 3 - GC.gc(false) - device_synchronize() - elseif phase == 4 - GC.gc(true) - device_synchronize() - elseif phase == 5 - # in case we had a release threshold configured - trim(memory_pool(state.device)) - else - break - end - else - if phase == 1 - GC.gc(false) - elseif phase == 2 - GC.gc(true) - else - break - end - end - phase += 1 - - ret = $(esc(ex)) - $(esc(isfailed))(ret) || break - end - end - - ret - end -end - -# XXX: function version for use in CUDAdrv where we haven't loaded pool.jl yet -function retry_reclaim(f, check) - @retry_reclaim check f() -end - ## utilities From 6a3862d74a309999a6d15b0f825ca9b6e48fec9f Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 17 Feb 2022 12:49:28 +0100 Subject: [PATCH 2/4] Move the context from the ArrayStorage to the underlying buffer. 
--- lib/cudadrv/memory.jl | 20 ++++++++++++-------- lib/cudadrv/module/global.jl | 2 +- src/array.jl | 26 ++++++++++++-------------- test/array.jl | 2 +- test/pointer.jl | 2 +- 5 files changed, 27 insertions(+), 25 deletions(-) diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl index 8d51fc26b2..d16d13211d 100644 --- a/lib/cudadrv/memory.jl +++ b/lib/cudadrv/memory.jl @@ -42,13 +42,14 @@ Base.unsafe_convert(T::Type{<:Union{Ptr,CuPtr,CuArrayPtr}}, buf::AbstractBuffer) A buffer of device memory residing on the GPU. """ struct DeviceBuffer <: AbstractBuffer + ctx::CuContext ptr::CuPtr{Cvoid} bytesize::Int async::Bool end -DeviceBuffer() = DeviceBuffer(CU_NULL, 0, false) +DeviceBuffer() = DeviceBuffer(context(), CU_NULL, 0, false) Base.pointer(buf::DeviceBuffer) = buf.ptr Base.sizeof(buf::DeviceBuffer) = buf.bytesize @@ -85,7 +86,7 @@ function alloc(::Type{DeviceBuffer}, bytesize::Integer; CUDA.cuMemAlloc_v2(ptr_ref, bytesize) end - return DeviceBuffer(reinterpret(CuPtr{Cvoid}, ptr_ref[]), bytesize, async) + return DeviceBuffer(context(), reinterpret(CuPtr{Cvoid}, ptr_ref[]), bytesize, async) end function free(buf::DeviceBuffer; stream::Union{Nothing,CuStream}=nothing) @@ -109,11 +110,12 @@ end A buffer of pinned memory on the CPU, possibly accessible on the GPU. 
""" struct HostBuffer <: AbstractBuffer + ctx::CuContext ptr::Ptr{Cvoid} bytesize::Int end -HostBuffer() = HostBuffer(C_NULL, 0) +HostBuffer() = HostBuffer(context(), C_NULL, 0) Base.pointer(buf::HostBuffer) = buf.ptr Base.sizeof(buf::HostBuffer) = buf.bytesize @@ -157,7 +159,7 @@ function alloc(::Type{HostBuffer}, bytesize::Integer, flags=0) ptr_ref = Ref{Ptr{Cvoid}}() CUDA.cuMemHostAlloc(ptr_ref, bytesize, flags) - return HostBuffer(ptr_ref[], bytesize) + return HostBuffer(context(), ptr_ref[], bytesize) end @@ -179,7 +181,7 @@ function register(::Type{HostBuffer}, ptr::Ptr, bytesize::Integer, flags=0) CUDA.cuMemHostRegister_v2(ptr, bytesize, flags) - return HostBuffer(ptr, bytesize) + return HostBuffer(context(), ptr, bytesize) end """ @@ -208,11 +210,12 @@ end A managed buffer that is accessible on both the CPU and GPU. """ struct UnifiedBuffer <: AbstractBuffer + ctx::CuContext ptr::CuPtr{Cvoid} bytesize::Int end -UnifiedBuffer() = UnifiedBuffer(CU_NULL, 0) +UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0) Base.pointer(buf::UnifiedBuffer) = buf.ptr Base.sizeof(buf::UnifiedBuffer) = buf.bytesize @@ -241,7 +244,7 @@ function alloc(::Type{UnifiedBuffer}, bytesize::Integer, ptr_ref = Ref{CuPtr{Cvoid}}() CUDA.cuMemAllocManaged(ptr_ref, bytesize, flags) - return UnifiedBuffer(ptr_ref[], bytesize) + return UnifiedBuffer(context(), ptr_ref[], bytesize) end @@ -281,6 +284,7 @@ end ## array buffer mutable struct ArrayBuffer{T,N} <: AbstractBuffer + ctx::CuContext ptr::CuArrayPtr{T} dims::Dims{N} end @@ -342,7 +346,7 @@ function alloc(::Type{<:ArrayBuffer{T}}, dims::Dims{N}) where {T,N} CUDA.cuArray3DCreate_v2(handle_ref, allocateArray_ref) ptr = reinterpret(CuArrayPtr{T}, handle_ref[]) - return ArrayBuffer{T,N}(ptr, dims) + return ArrayBuffer{T,N}(context(), ptr, dims) end function free(buf::ArrayBuffer) diff --git a/lib/cudadrv/module/global.jl b/lib/cudadrv/module/global.jl index d25e171a1e..0369acc3db 100644 --- a/lib/cudadrv/module/global.jl +++ 
b/lib/cudadrv/module/global.jl @@ -23,7 +23,7 @@ struct CuGlobal{T} if nbytes_ref[] != sizeof(T) throw(ArgumentError("size of global '$name' does not match type parameter type $T")) end - buf = Mem.DeviceBuffer(ptr_ref[], nbytes_ref[], false) + buf = Mem.DeviceBuffer(context(), ptr_ref[], nbytes_ref[], false) return new{T}(buf) end diff --git a/src/array.jl b/src/array.jl index 81ad6ee0f8..2dd7686586 100644 --- a/src/array.jl +++ b/src/array.jl @@ -9,8 +9,6 @@ export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, is_unified struct ArrayStorage{B} buffer::B - ctx::CuContext - # the refcount also encodes the state of the array: # < 0: unmanaged # = 0: freed @@ -18,8 +16,8 @@ struct ArrayStorage{B} refcount::Threads.Atomic{Int} end -ArrayStorage(buf::B, ctx, state::Int) where {B} = - ArrayStorage{B}(buf, ctx, Threads.Atomic{Int}(state)) +ArrayStorage(buf::B, state::Int) where {B} = + ArrayStorage{B}(buf, Threads.Atomic{Int}(state)) ## array type @@ -42,7 +40,7 @@ mutable struct CuArray{T,N,B} <: AbstractGPUArray{T,N} maxsize end buf = alloc(B, bufsize) - storage = ArrayStorage(buf, context(), 1) + storage = ArrayStorage(buf, 1) obj = new{T,N,B}(storage, maxsize, 0, dims) finalizer(unsafe_finalize!, obj) end @@ -77,7 +75,7 @@ function unsafe_free!(xs::CuArray, stream::CuStream=stream()) refcount = Threads.atomic_add!(xs.storage.refcount, -1) if refcount == 1 - context!(xs.storage.ctx; skip_destroyed=true) do + context!(context(xs); skip_destroyed=true) do free(xs.storage.buffer; stream) end end @@ -196,12 +194,12 @@ function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T, buf = try typ = memory_type(ptr) if is_managed(ptr) - Mem.UnifiedBuffer(ptr, sz) + Mem.UnifiedBuffer(ctx, ptr, sz) elseif typ == CU_MEMORYTYPE_DEVICE # TODO: can we identify whether this pointer was allocated asynchronously? 
- Mem.DeviceBuffer(ptr, sz, false) + Mem.DeviceBuffer(ctx, ptr, sz, false) elseif typ == CU_MEMORYTYPE_HOST - Mem.HostBuffer(host_pointer(ptr), sz) + Mem.HostBuffer(ctx, host_pointer(ptr), sz) else error("Unknown memory type; please file an issue.") end @@ -209,7 +207,7 @@ function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T, error("Could not identify the buffer type; are you passing a valid CUDA pointer to unsafe_wrap?") end - storage = ArrayStorage(buf, ctx, own ? 1 : -1) + storage = ArrayStorage(buf, own ? 1 : -1) CuArray{T, length(dims)}(storage, dims) end @@ -232,12 +230,12 @@ Base.sizeof(x::CuArray) = Base.elsize(x) * length(x) function context(A::CuArray) A.storage === nothing && throw(UndefRefError()) - return A.storage.ctx + return A.storage.buffer.ctx end function device(A::CuArray) A.storage === nothing && throw(UndefRefError()) - return device(A.storage.ctx) + return device(A.storage.buffer.ctx) end @@ -826,14 +824,14 @@ function Base.resize!(A::CuVector{T}, n::Integer) where T maxsize end - new_storage = context!(A.storage.ctx) do + new_storage = context!(context(A)) do buf = alloc(typeof(A.storage.buffer), bufsize) ptr = convert(CuPtr{T}, buf) m = min(length(A), n) if m > 0 unsafe_copyto!(ptr, pointer(A), m) end - ArrayStorage(buf, A.storage.ctx, 1) + ArrayStorage(buf, 1) end unsafe_free!(A) diff --git a/test/array.jl b/test/array.jl index 37184d793e..f6b65d6383 100644 --- a/test/array.jl +++ b/test/array.jl @@ -36,7 +36,7 @@ import Adapt @test eltype(a) == eltype(b) @test ndims(a) == ndims(b) @test a.storage.buffer.ptr == b.storage.buffer.ptr - @test a.storage.ctx == b.storage.ctx + @test a.storage.buffer.ctx == b.storage.buffer.ctx @test a.maxsize == b.maxsize @test a.offset == b.offset @test a.dims == b.dims diff --git a/test/pointer.jl b/test/pointer.jl index 2368114ed9..fe78cc6271 100644 --- a/test/pointer.jl +++ b/test/pointer.jl @@ -41,7 +41,7 @@ ccall(:clock, Nothing, (Ptr{Int},), a) @test_throws Exception 
ccall(:clock, Nothing, (CuPtr{Int},), a) ccall(:clock, Nothing, (PtrOrCuPtr{Int},), a) -b = CuArray{eltype(a), ndims(a)}(CUDA.ArrayStorage(Mem.DeviceBuffer(), context(), 0), size(a)) +b = CuArray{eltype(a), ndims(a)}(CUDA.ArrayStorage(Mem.DeviceBuffer(), 0), size(a)) ccall(:clock, Nothing, (CuPtr{Int},), b) @test_throws Exception ccall(:clock, Nothing, (Ptr{Int},), b) ccall(:clock, Nothing, (PtrOrCuPtr{Int},), b) From 92b5fe2e4aa894013ddc9c36493a0ece4db08f9c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 17 Feb 2022 12:53:17 +0100 Subject: [PATCH 3/4] Only switch contexts during finalization, not when early freeing. --- lib/cufft/fft.jl | 10 ++++++---- src/array.jl | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/cufft/fft.jl b/lib/cufft/fft.jl index a2e03f75ac..e66e576480 100644 --- a/lib/cufft/fft.jl +++ b/lib/cufft/fft.jl @@ -25,13 +25,15 @@ abstract type CuFFTPlan{T<:cufftNumber, K, inplace} <: Plan{T} end Base.convert(::Type{cufftHandle}, p::CuFFTPlan) = p.handle function CUDA.unsafe_free!(plan::CuFFTPlan, stream::CuStream=stream()) - context!(plan.ctx; skip_destroyed=true) do - cufftDestroy(plan) - end + cufftDestroy(plan) unsafe_free!(plan.workarea, stream) end -unsafe_finalize!(plan::CuFFTPlan) = unsafe_free!(plan, default_stream()) +function unsafe_finalize!(plan::CuFFTPlan) + context!(plan.ctx; skip_destroyed=true) do + unsafe_free!(plan, default_stream()) + end +end mutable struct cCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace} handle::cufftHandle diff --git a/src/array.jl b/src/array.jl index 2dd7686586..72fe68e346 100644 --- a/src/array.jl +++ b/src/array.jl @@ -97,6 +97,8 @@ function unsafe_finalize!(xs::CuArray) # streams involved, or by refcounting uses and decrementing that refcount after the # operation using `cuLaunchHostFunc`. See CUDA.jl#778 and CUDA.jl#780 for details. 
unsafe_free!(xs, default_stream()) + # NOTE: we don't switch contexts here, but in unsafe_free!, as arrays are refcounted + # and we may not have to free the memory yet. end From ea43c67c12bc9c9a7a1ea8b035a2430993f2839a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 17 Feb 2022 13:23:25 +0100 Subject: [PATCH 4/4] Don't inspect the active state during pool free. Avoids querying the current context, which may cause a task switch. --- lib/cudadrv/context.jl | 2 ++ src/pool.jl | 23 +++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/lib/cudadrv/context.jl b/lib/cudadrv/context.jl index bdb7996dda..cb5462179a 100644 --- a/lib/cudadrv/context.jl +++ b/lib/cudadrv/context.jl @@ -39,6 +39,8 @@ mutable struct CuContext valid::Bool function new_unique(handle) + # XXX: this makes it dangerous to call this function from finalizers. + # can we do without the lock? Base.@lock context_lock get!(valid_contexts, handle) do new(handle, true) end diff --git a/src/pool.jl b/src/pool.jl index 378e98b37e..f16e362839 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -308,12 +308,27 @@ Releases a buffer `buf` to the memory pool. return end @inline function _free(buf::Mem.DeviceBuffer; stream::Union{Nothing,CuStream}) - state = active_state() - if stream_ordered(state.device) + # NOTE: this function is often called from finalizers, from which we can't switch tasks, + # so we need to take care not to call managed functions (i.e. functions that may + # initialize the CUDA context) because querying the active context using + # `current_context()` takes a lock + + # verify that the caller has called `context!` already, which eagerly activates the + # context (i.e. doesn't only set it in the state, but configures the CUDA APIs). 
+ handle_ref = Ref{CUcontext}() + cuCtxGetCurrent(handle_ref) + if buf.ctx.handle != handle_ref[] + error("Trying to free $buf from a different context than the one it was allocated from ($(handle_ref[]))") + end + + dev = current_device() + if stream_ordered(dev) # mark the pool as active - pool_mark(state.device) + pool_mark(dev) - actual_free(buf; stream=something(stream, state.stream)) + # for safety, we default to the default stream and force this operation to be ordered + # against all other streams. to opt out of this, pass a specific stream instead. + actual_free(buf; stream=something(stream, default_stream())) else actual_free(buf) end