Consider running GC when allocating and synchronizing #2304

Merged: 7 commits, Apr 22, 2024
docs/src/lib/driver.md: 1 addition & 1 deletion

@@ -138,7 +138,7 @@ methods then work with these raw pointers:

```@docs
CUDA.memory_status
-CUDA.available_memory
+CUDA.free_memory
CUDA.total_memory
```

lib/cudadrv/memory.jl: 3 additions & 3 deletions

@@ -767,11 +767,11 @@ end
end # module Mem

"""
-available_memory()
+free_memory()
-Returns the available amount of memory (in bytes), available for allocation by the CUDA context.
+Returns the free amount of memory (in bytes), available for allocation by the CUDA context.
"""
-available_memory() = Mem.info()[1]
+free_memory() = Mem.info()[1]

"""
total_memory()
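For context on the rename, here is a minimal usage sketch (not part of this PR's diff), assuming a functional GPU and the post-rename names:

```julia
using CUDA

# Driver-level memory queries, using the renamed function.
free  = CUDA.free_memory()    # bytes currently free for allocation
total = CUDA.total_memory()   # total bytes of device memory
println("free: $(Base.format_bytes(free)) of $(Base.format_bytes(total))")

# The old name keeps working via the deprecation added later in this PR,
# forwarding to `free_memory` (and warning when depwarns are enabled).
CUDA.available_memory()
```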
lib/cudadrv/synchronization.jl: 9 additions & 0 deletions

@@ -182,9 +182,11 @@ function device_synchronize(; blocking::Bool=false, spin::Bool=true)
if spin && spinning_synchronization(isdone, legacy_stream())
cuCtxSynchronize()
else
+maybe_collect(true)
nonblocking_synchronize(context())
end
else
+maybe_collect(true)
cuCtxSynchronize()
end

@@ -196,9 +198,11 @@ function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool
if spin && spinning_synchronization(isdone, stream)
cuStreamSynchronize(stream)
else
+maybe_collect(true)
nonblocking_synchronize(stream)
end
else
+maybe_collect(true)
cuStreamSynchronize(stream)
end

@@ -210,9 +214,11 @@ function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
if spin && spinning_synchronization(isdone, event)
cuEventSynchronize(event)
else
+maybe_collect(true)
nonblocking_synchronize(event)
end
else
+maybe_collect(true)
cuEventSynchronize(event)
end
end
@@ -269,6 +275,7 @@ function device_synchronize(; blocking::Bool=false, spin::Bool=true)
nonblocking_synchronize(stream)
end
end
+maybe_collect(true)
cuCtxSynchronize()

check_exceptions()
@@ -280,6 +287,7 @@ function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool
nonblocking_synchronize(stream)
end
end
+maybe_collect(true)
cuStreamSynchronize(stream)

check_exceptions()
@@ -289,6 +297,7 @@ function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
spin && spinning_synchronization(isdone, event)
end
+maybe_collect(true)
cuEventSynchronize(event)
end

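The additions above put a `maybe_collect(true)` call on every path that is about to block waiting for the GPU, so the otherwise idle CPU time can be used to run the Julia GC when device memory is under pressure. The helper itself is defined elsewhere in CUDA.jl; the sketch below is only a hypothetical illustration of the idea, not the actual implementation.

```julia
# Hypothetical maybe_collect-style heuristic (the real CUDA.jl version also
# tracks allocation pressure and the cost of previous collections): if we are
# going to block anyway and the device is nearly full, spend the wait on an
# incremental Julia GC so cached device buffers can be freed.
function maybe_collect_sketch(will_block::Bool; threshold = 0.9)
    used_fraction = 1 - CUDA.free_memory() / CUDA.total_memory()
    if will_block && used_fraction > threshold
        GC.gc(false)    # quick, non-full collection while the GPU finishes up
    end
    return nothing
end

# e.g. invoked right before a blocking cuStreamSynchronize(stream)
maybe_collect_sketch(true)
```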
lib/cudnn/src/convolution.jl: 1 addition & 1 deletion

@@ -331,6 +331,6 @@ function cudnnFindConvolutionAlgorithmWorkspaceSize(x)
# Because algorithm discovery runs infrequently yet allocates more than conv functions,
# This is a good place to synchronize and trim the memory pool to reduce fragmentation.
CUDA.reclaim()
-gpufree = CUDA.available_memory() + coalesce(CUDA.cached_memory(), 0)
+gpufree = CUDA.free_memory() + coalesce(CUDA.cached_memory(), 0)
min(gpufree ÷ 10, sizeof(x) * 100)
end
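The heuristic above caps the workspace used for algorithm discovery at the smaller of one tenth of the effectively free GPU memory (free plus whatever the pool still caches) and 100 times the size of the input. A worked example with made-up numbers:

```julia
# Assume 8 GiB effectively free and a 64 MiB input tensor.
gpufree  = 8 * 2^30      # CUDA.free_memory() + cached pool memory
sizeof_x = 64 * 2^20     # sizeof(x)

workspace = min(gpufree ÷ 10, sizeof_x * 100)
# min(≈819 MiB, 6400 MiB) ≈ 819 MiB: the memory-based cap wins here
```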
src/CUDA.jl: 1 addition & 0 deletions

@@ -125,6 +125,7 @@ export CUDABackend
# StaticArrays is still a direct dependency, so directly include the extension
include("../ext/StaticArraysExt.jl")

include("deprecated.jl")
include("precompile.jl")

end
src/deprecated.jl: 1 addition & 0 deletions

@@ -0,0 +1 @@
+@deprecate available_memory() free_memory()
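Julia's `@deprecate` keeps the old name callable: it defines `available_memory()` as a thin forwarder that emits a deprecation warning (shown only with `--depwarn=yes`, or during tests) and then calls `free_memory()`. Roughly, in simplified form (the macro also handles exporting the old name):

```julia
function available_memory()
    Base.depwarn("`available_memory()` is deprecated, use `free_memory()` instead.",
                 :available_memory)
    free_memory()
end
```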
src/initialization.jl: 2 additions & 2 deletions

@@ -72,9 +72,9 @@ function __init__()
return
end

-if driver < v"11.2"
+if driver < v"11.3"
@warn """The NVIDIA driver on this system only supports up to CUDA $driver.
-For performance reasons, it is recommended to upgrade to a driver that supports CUDA 11.2 or higher."""
+For performance reasons, it is recommended to upgrade to a driver that supports CUDA 11.3 or higher."""
end

# check that we have a runtime
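The warning threshold moves from CUDA 11.2 to 11.3. To see what a given setup reports, the driver and runtime versions can be queried directly; the names below match current CUDA.jl, but treat this as a sketch if your version differs.

```julia
using CUDA

drv = CUDA.driver_version()    # newest CUDA version the NVIDIA driver supports
rt  = CUDA.runtime_version()   # CUDA runtime version CUDA.jl is using

drv < v"11.3" && @info "Driver only supports CUDA $drv; consider upgrading"
println("driver: $drv, runtime: $rt")
```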