From 507df5f9a039a7afe6e84c1eb024e01d38325962 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 17 Feb 2022 11:39:33 +0100 Subject: [PATCH 1/4] Reuse at-retry_reclaim from alloc. --- src/pool.jl | 167 ++++++++++++++++++++++------------------------------ 1 file changed, 71 insertions(+), 96 deletions(-) diff --git a/src/pool.jl b/src/pool.jl index c5d4ead420..378e98b37e 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -164,6 +164,73 @@ function Base.showerror(io::IO, err::OutOfGPUMemoryError) memory_status(io) end +""" + @retry_reclaim isfailed(ret) ex + +Run a block of code `ex` repeatedly until it successfully allocates the memory it needs. +Retries are only attempted when calling `isfailed` with the current return value is true. +At each try, more and more memory is freed from the CUDA memory pool. When that is not +possible anymore, the latest returned value will be returned. + +This macro is intended for use with CUDA APIs, which sometimes allocate (outside of the +CUDA memory pool) and return a specific error code when failing to. +""" +macro retry_reclaim(isfailed, ex) + quote + ret = $(esc(ex)) + + # slow path, incrementally reclaiming more memory until we succeed + if $(esc(isfailed))(ret) + state = active_state() + is_stream_ordered = stream_ordered(state.device) + + phase = 1 + while true + if is_stream_ordered + # NOTE: the stream-ordered allocator only releases memory on actual API calls, + # and not when our synchronization routines query the relevant streams. + # we do still call our routines to minimize the time we block in libcuda. 
+ if phase == 1 + synchronize(state.stream) + elseif phase == 2 + device_synchronize() + elseif phase == 3 + GC.gc(false) + device_synchronize() + elseif phase == 4 + GC.gc(true) + device_synchronize() + elseif phase == 5 + # in case we had a release threshold configured + trim(memory_pool(state.device)) + else + break + end + else + if phase == 1 + GC.gc(false) + elseif phase == 2 + GC.gc(true) + else + break + end + end + phase += 1 + + ret = $(esc(ex)) + $(esc(isfailed))(ret) || break + end + end + + ret + end +end + +# XXX: function version for use in CUDAdrv where we haven't loaded pool.jl yet +function retry_reclaim(f, check) + @retry_reclaim check f() +end + """ alloc([::BufferType], sz; [stream::CuStream]) @@ -192,36 +259,11 @@ end gctime = 0.0 time = Base.@elapsed begin - buf = nothing - if stream_ordered(state.device) - # mark the pool as active - pool_mark(state.device) - - for phase in 1:5 - if phase == 2 - gctime += Base.@elapsed GC.gc(false) - elseif phase == 3 - gctime += Base.@elapsed GC.gc(true) - elseif phase == 4 - synchronize(stream) - elseif phase == 5 - device_synchronize() - end - - buf = actual_alloc(sz; async=true, stream) - buf === nothing || break - end + buf = if stream_ordered(state.device) + pool_mark(state.device) # mark the pool as active + @retry_reclaim isnothing actual_alloc(sz; async=true, stream) else - for phase in 1:3 - if phase == 2 - gctime += Base.@elapsed GC.gc(false) - elseif phase == 3 - gctime += Base.@elapsed GC.gc(true) - end - - buf = actual_alloc(sz; async=false) - buf === nothing || break - end + @retry_reclaim isnothing actual_alloc(sz; async=false) end buf === nothing && throw(OutOfGPUMemoryError(sz)) end @@ -297,73 +339,6 @@ function reclaim(sz::Int=typemax(Int)) end end -""" - @retry_reclaim isfailed(ret) ex - -Run a block of code `ex` repeatedly until it successfully allocates the memory it needs. -Retries are only attempted when calling `isfailed` with the current return value is true. 
-At each try, more and more memory is freed from the CUDA memory pool. When that is not -possible anymore, the latest returned value will be returned. - -This macro is intended for use with CUDA APIs, which sometimes allocate (outside of the -CUDA memory pool) and return a specific error code when failing to. -""" -macro retry_reclaim(isfailed, ex) - quote - ret = $(esc(ex)) - - # slow path, incrementally reclaiming more memory until we succeed - if $(esc(isfailed))(ret) - state = active_state() - is_stream_ordered = stream_ordered(state.device) - - phase = 1 - while true - if is_stream_ordered - # NOTE: the stream-ordered allocator only releases memory on actual API calls, - # and not when our synchronization routines query the relevant streams. - # we do still call our routines to minimize the time we block in libcuda. - if phase == 1 - synchronize(state.stream) - elseif phase == 2 - device_synchronize() - elseif phase == 3 - GC.gc(false) - device_synchronize() - elseif phase == 4 - GC.gc(true) - device_synchronize() - elseif phase == 5 - # in case we had a release threshold configured - trim(memory_pool(state.device)) - else - break - end - else - if phase == 1 - GC.gc(false) - elseif phase == 2 - GC.gc(true) - else - break - end - end - phase += 1 - - ret = $(esc(ex)) - $(esc(isfailed))(ret) || break - end - end - - ret - end -end - -# XXX: function version for use in CUDAdrv where we haven't loaded pool.jl yet -function retry_reclaim(f, check) - @retry_reclaim check f() -end - ## utilities From 6a3862d74a309999a6d15b0f825ca9b6e48fec9f Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 17 Feb 2022 12:49:28 +0100 Subject: [PATCH 2/4] Move the context from the ArrayStorage to the underlying buffer. 
--- lib/cudadrv/memory.jl | 20 ++++++++++++-------- lib/cudadrv/module/global.jl | 2 +- src/array.jl | 26 ++++++++++++-------------- test/array.jl | 2 +- test/pointer.jl | 2 +- 5 files changed, 27 insertions(+), 25 deletions(-) diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl index 8d51fc26b2..d16d13211d 100644 --- a/lib/cudadrv/memory.jl +++ b/lib/cudadrv/memory.jl @@ -42,13 +42,14 @@ Base.unsafe_convert(T::Type{<:Union{Ptr,CuPtr,CuArrayPtr}}, buf::AbstractBuffer) A buffer of device memory residing on the GPU. """ struct DeviceBuffer <: AbstractBuffer + ctx::CuContext ptr::CuPtr{Cvoid} bytesize::Int async::Bool end -DeviceBuffer() = DeviceBuffer(CU_NULL, 0, false) +DeviceBuffer() = DeviceBuffer(context(), CU_NULL, 0, false) Base.pointer(buf::DeviceBuffer) = buf.ptr Base.sizeof(buf::DeviceBuffer) = buf.bytesize @@ -85,7 +86,7 @@ function alloc(::Type{DeviceBuffer}, bytesize::Integer; CUDA.cuMemAlloc_v2(ptr_ref, bytesize) end - return DeviceBuffer(reinterpret(CuPtr{Cvoid}, ptr_ref[]), bytesize, async) + return DeviceBuffer(context(), reinterpret(CuPtr{Cvoid}, ptr_ref[]), bytesize, async) end function free(buf::DeviceBuffer; stream::Union{Nothing,CuStream}=nothing) @@ -109,11 +110,12 @@ end A buffer of pinned memory on the CPU, possibly accessible on the GPU. 
""" struct HostBuffer <: AbstractBuffer + ctx::CuContext ptr::Ptr{Cvoid} bytesize::Int end -HostBuffer() = HostBuffer(C_NULL, 0) +HostBuffer() = HostBuffer(context(), C_NULL, 0) Base.pointer(buf::HostBuffer) = buf.ptr Base.sizeof(buf::HostBuffer) = buf.bytesize @@ -157,7 +159,7 @@ function alloc(::Type{HostBuffer}, bytesize::Integer, flags=0) ptr_ref = Ref{Ptr{Cvoid}}() CUDA.cuMemHostAlloc(ptr_ref, bytesize, flags) - return HostBuffer(ptr_ref[], bytesize) + return HostBuffer(context(), ptr_ref[], bytesize) end @@ -179,7 +181,7 @@ function register(::Type{HostBuffer}, ptr::Ptr, bytesize::Integer, flags=0) CUDA.cuMemHostRegister_v2(ptr, bytesize, flags) - return HostBuffer(ptr, bytesize) + return HostBuffer(context(), ptr, bytesize) end """ @@ -208,11 +210,12 @@ end A managed buffer that is accessible on both the CPU and GPU. """ struct UnifiedBuffer <: AbstractBuffer + ctx::CuContext ptr::CuPtr{Cvoid} bytesize::Int end -UnifiedBuffer() = UnifiedBuffer(CU_NULL, 0) +UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0) Base.pointer(buf::UnifiedBuffer) = buf.ptr Base.sizeof(buf::UnifiedBuffer) = buf.bytesize @@ -241,7 +244,7 @@ function alloc(::Type{UnifiedBuffer}, bytesize::Integer, ptr_ref = Ref{CuPtr{Cvoid}}() CUDA.cuMemAllocManaged(ptr_ref, bytesize, flags) - return UnifiedBuffer(ptr_ref[], bytesize) + return UnifiedBuffer(context(), ptr_ref[], bytesize) end @@ -281,6 +284,7 @@ end ## array buffer mutable struct ArrayBuffer{T,N} <: AbstractBuffer + ctx::CuContext ptr::CuArrayPtr{T} dims::Dims{N} end @@ -342,7 +346,7 @@ function alloc(::Type{<:ArrayBuffer{T}}, dims::Dims{N}) where {T,N} CUDA.cuArray3DCreate_v2(handle_ref, allocateArray_ref) ptr = reinterpret(CuArrayPtr{T}, handle_ref[]) - return ArrayBuffer{T,N}(ptr, dims) + return ArrayBuffer{T,N}(context(), ptr, dims) end function free(buf::ArrayBuffer) diff --git a/lib/cudadrv/module/global.jl b/lib/cudadrv/module/global.jl index d25e171a1e..0369acc3db 100644 --- a/lib/cudadrv/module/global.jl +++ 
b/lib/cudadrv/module/global.jl @@ -23,7 +23,7 @@ struct CuGlobal{T} if nbytes_ref[] != sizeof(T) throw(ArgumentError("size of global '$name' does not match type parameter type $T")) end - buf = Mem.DeviceBuffer(ptr_ref[], nbytes_ref[], false) + buf = Mem.DeviceBuffer(context(), ptr_ref[], nbytes_ref[], false) return new{T}(buf) end diff --git a/src/array.jl b/src/array.jl index 81ad6ee0f8..2dd7686586 100644 --- a/src/array.jl +++ b/src/array.jl @@ -9,8 +9,6 @@ export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, is_unified struct ArrayStorage{B} buffer::B - ctx::CuContext - # the refcount also encodes the state of the array: # < 0: unmanaged # = 0: freed @@ -18,8 +16,8 @@ struct ArrayStorage{B} refcount::Threads.Atomic{Int} end -ArrayStorage(buf::B, ctx, state::Int) where {B} = - ArrayStorage{B}(buf, ctx, Threads.Atomic{Int}(state)) +ArrayStorage(buf::B, state::Int) where {B} = + ArrayStorage{B}(buf, Threads.Atomic{Int}(state)) ## array type @@ -42,7 +40,7 @@ mutable struct CuArray{T,N,B} <: AbstractGPUArray{T,N} maxsize end buf = alloc(B, bufsize) - storage = ArrayStorage(buf, context(), 1) + storage = ArrayStorage(buf, 1) obj = new{T,N,B}(storage, maxsize, 0, dims) finalizer(unsafe_finalize!, obj) end @@ -77,7 +75,7 @@ function unsafe_free!(xs::CuArray, stream::CuStream=stream()) refcount = Threads.atomic_add!(xs.storage.refcount, -1) if refcount == 1 - context!(xs.storage.ctx; skip_destroyed=true) do + context!(context(xs); skip_destroyed=true) do free(xs.storage.buffer; stream) end end @@ -196,12 +194,12 @@ function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T, buf = try typ = memory_type(ptr) if is_managed(ptr) - Mem.UnifiedBuffer(ptr, sz) + Mem.UnifiedBuffer(ctx, ptr, sz) elseif typ == CU_MEMORYTYPE_DEVICE # TODO: can we identify whether this pointer was allocated asynchronously? 
- Mem.DeviceBuffer(ptr, sz, false) + Mem.DeviceBuffer(ctx, ptr, sz, false) elseif typ == CU_MEMORYTYPE_HOST - Mem.HostBuffer(host_pointer(ptr), sz) + Mem.HostBuffer(ctx, host_pointer(ptr), sz) else error("Unknown memory type; please file an issue.") end @@ -209,7 +207,7 @@ function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T, error("Could not identify the buffer type; are you passing a valid CUDA pointer to unsafe_wrap?") end - storage = ArrayStorage(buf, ctx, own ? 1 : -1) + storage = ArrayStorage(buf, own ? 1 : -1) CuArray{T, length(dims)}(storage, dims) end @@ -232,12 +230,12 @@ Base.sizeof(x::CuArray) = Base.elsize(x) * length(x) function context(A::CuArray) A.storage === nothing && throw(UndefRefError()) - return A.storage.ctx + return A.storage.buffer.ctx end function device(A::CuArray) A.storage === nothing && throw(UndefRefError()) - return device(A.storage.ctx) + return device(A.storage.buffer.ctx) end @@ -826,14 +824,14 @@ function Base.resize!(A::CuVector{T}, n::Integer) where T maxsize end - new_storage = context!(A.storage.ctx) do + new_storage = context!(context(A)) do buf = alloc(typeof(A.storage.buffer), bufsize) ptr = convert(CuPtr{T}, buf) m = min(length(A), n) if m > 0 unsafe_copyto!(ptr, pointer(A), m) end - ArrayStorage(buf, A.storage.ctx, 1) + ArrayStorage(buf, 1) end unsafe_free!(A) diff --git a/test/array.jl b/test/array.jl index 37184d793e..f6b65d6383 100644 --- a/test/array.jl +++ b/test/array.jl @@ -36,7 +36,7 @@ import Adapt @test eltype(a) == eltype(b) @test ndims(a) == ndims(b) @test a.storage.buffer.ptr == b.storage.buffer.ptr - @test a.storage.ctx == b.storage.ctx + @test a.storage.buffer.ctx == b.storage.buffer.ctx @test a.maxsize == b.maxsize @test a.offset == b.offset @test a.dims == b.dims diff --git a/test/pointer.jl b/test/pointer.jl index 2368114ed9..fe78cc6271 100644 --- a/test/pointer.jl +++ b/test/pointer.jl @@ -41,7 +41,7 @@ ccall(:clock, Nothing, (Ptr{Int},), a) @test_throws Exception 
ccall(:clock, Nothing, (CuPtr{Int},), a) ccall(:clock, Nothing, (PtrOrCuPtr{Int},), a) -b = CuArray{eltype(a), ndims(a)}(CUDA.ArrayStorage(Mem.DeviceBuffer(), context(), 0), size(a)) +b = CuArray{eltype(a), ndims(a)}(CUDA.ArrayStorage(Mem.DeviceBuffer(), 0), size(a)) ccall(:clock, Nothing, (CuPtr{Int},), b) @test_throws Exception ccall(:clock, Nothing, (Ptr{Int},), b) ccall(:clock, Nothing, (PtrOrCuPtr{Int},), b) From 92b5fe2e4aa894013ddc9c36493a0ece4db08f9c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 17 Feb 2022 12:53:17 +0100 Subject: [PATCH 3/4] Only switch contexts during finalization, not when early freeing. --- lib/cufft/fft.jl | 10 ++++++---- src/array.jl | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/cufft/fft.jl b/lib/cufft/fft.jl index a2e03f75ac..e66e576480 100644 --- a/lib/cufft/fft.jl +++ b/lib/cufft/fft.jl @@ -25,13 +25,15 @@ abstract type CuFFTPlan{T<:cufftNumber, K, inplace} <: Plan{T} end Base.convert(::Type{cufftHandle}, p::CuFFTPlan) = p.handle function CUDA.unsafe_free!(plan::CuFFTPlan, stream::CuStream=stream()) - context!(plan.ctx; skip_destroyed=true) do - cufftDestroy(plan) - end + cufftDestroy(plan) unsafe_free!(plan.workarea, stream) end -unsafe_finalize!(plan::CuFFTPlan) = unsafe_free!(plan, default_stream()) +function unsafe_finalize!(plan::CuFFTPlan) + context!(plan.ctx; skip_destroyed=true) do + unsafe_free!(plan, default_stream()) + end +end mutable struct cCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace} handle::cufftHandle diff --git a/src/array.jl b/src/array.jl index 2dd7686586..72fe68e346 100644 --- a/src/array.jl +++ b/src/array.jl @@ -97,6 +97,8 @@ function unsafe_finalize!(xs::CuArray) # streams involved, or by refcounting uses and decrementing that refcount after the # operation using `cuLaunchHostFunc`. See CUDA.jl#778 and CUDA.jl#780 for details. 
unsafe_free!(xs, default_stream()) + # NOTE: we don't switch contexts here, but in unsafe_free!, as arrays are refcounted + # and we may not have to free the memory yet. end From ea43c67c12bc9c9a7a1ea8b035a2430993f2839a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 17 Feb 2022 13:23:25 +0100 Subject: [PATCH 4/4] Don't inspect the active state during pool free. Avoids querying the current context, which may cause a task switch. --- lib/cudadrv/context.jl | 2 ++ src/pool.jl | 23 +++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/lib/cudadrv/context.jl b/lib/cudadrv/context.jl index bdb7996dda..cb5462179a 100644 --- a/lib/cudadrv/context.jl +++ b/lib/cudadrv/context.jl @@ -39,6 +39,8 @@ mutable struct CuContext valid::Bool function new_unique(handle) + # XXX: this makes it dangerous to call this function from finalizers. + # can we do without the lock? Base.@lock context_lock get!(valid_contexts, handle) do new(handle, true) end diff --git a/src/pool.jl b/src/pool.jl index 378e98b37e..f16e362839 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -308,12 +308,27 @@ Releases a buffer `buf` to the memory pool. return end @inline function _free(buf::Mem.DeviceBuffer; stream::Union{Nothing,CuStream}) - state = active_state() - if stream_ordered(state.device) + # NOTE: this function is often called from finalizers, from which we can't switch tasks, + # so we need to take care not to call managed functions (i.e. functions that may + # initialize the CUDA context) because querying the active context using + # `current_context()` takes a lock + + # verify that the caller has called `context!` already, which eagerly activates the + # context (i.e. doesn't only set it in the state, but configures the CUDA APIs). 
+ handle_ref = Ref{CUcontext}() + cuCtxGetCurrent(handle_ref) + if buf.ctx.handle != handle_ref[] + error("Trying to free $buf from a different context than the one it was allocated from ($(handle_ref[]))") + end + + dev = current_device() + if stream_ordered(dev) # mark the pool as active - pool_mark(state.device) + pool_mark(dev) - actual_free(buf; stream=something(stream, state.stream)) + # for safety, we default to the default stream and force this operation to be ordered + # against all other streams. to opt out of this, pass a specific stream instead. + actual_free(buf; stream=something(stream, default_stream())) else actual_free(buf) end