-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcuda_bench.jl
79 lines (71 loc) · 2.88 KB
/
cuda_bench.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
using BenchmarkTools, CUDA
# Element type of every benchmark array. Supported values: Float64 and Float32.
# Declared `const` so functions reading it do not go through an untyped global.
const DAT = Float64

# DAT_Int: integer type with the same width as DAT (used for the integer exponent).
# sc:      factor applied to nx in run_bench; doubling nx for the 4-byte type keeps
#          the array size in bytes equal to the 8-byte case.
# An unsupported DAT fails here instead of leaving DAT_Int and sc undefined.
const DAT_Int, sc = if DAT == Float64
    (Int64, 1)
elseif DAT == Float32
    (Int32, 2)
else
    error("unsupported DAT = $(DAT): expected Float64 or Float32")
end
"""
    memcopy_triad!(A, B, C, s)

Kernel body in which each thread writes one entry, `A[ix,iy] = B[ix,iy] + s*C[ix,iy]`.
The pair `(ix, iy)` is the thread's global 2D index, built from `blockIdx`, `blockDim`
and `threadIdx`.

Threads whose index lies outside `A` do nothing, so the launch grid may overshoot the
array. `B` and `C` are assumed to be at least as large as `A`; this is not checked.
Returns `nothing`.
"""
function memcopy_triad!(A, B, C, s)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    # `@inbounds` has to annotate the indexing expression itself: written in front of
    # the `function` keyword it does not reach into the method body.
    if ix <= size(A,1) && iy <= size(A,2)
        @inbounds A[ix,iy] = B[ix,iy] + s*C[ix,iy]
    end
    return
end
"""
    memcopy_triad_pow_int!(A, B, C, s, pow_int)

Kernel body in which each thread writes one entry,
`A[ix,iy] = B[ix,iy] + s*C[ix,iy]^pow_int`. The pair `(ix, iy)` is the thread's global
2D index, built from `blockIdx`, `blockDim` and `threadIdx`.

Threads whose index lies outside `A` do nothing, so the launch grid may overshoot the
array. `B` and `C` are assumed to be at least as large as `A`; this is not checked.
Returns `nothing`.
"""
function memcopy_triad_pow_int!(A, B, C, s, pow_int)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    # `@inbounds` has to annotate the indexing expression itself: written in front of
    # the `function` keyword it does not reach into the method body.
    if ix <= size(A,1) && iy <= size(A,2)
        @inbounds A[ix,iy] = B[ix,iy] + s*C[ix,iy]^pow_int
    end
    return
end
"""
    memcopy_triad_pow_float!(A, B, C, s, pow_float)

Kernel body in which each thread writes one entry,
`A[ix,iy] = B[ix,iy] + s*C[ix,iy]^pow_float`. The pair `(ix, iy)` is the thread's
global 2D index, built from `blockIdx`, `blockDim` and `threadIdx`.

Threads whose index lies outside `A` do nothing, so the launch grid may overshoot the
array. `B` and `C` are assumed to be at least as large as `A`; this is not checked.
Returns `nothing`.
"""
function memcopy_triad_pow_float!(A, B, C, s, pow_float)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    # `@inbounds` has to annotate the indexing expression itself: written in front of
    # the `function` keyword it does not reach into the method body.
    if ix <= size(A,1) && iy <= size(A,2)
        @inbounds A[ix,iy] = B[ix,iy] + s*C[ix,iy]^pow_float
    end
    return
end
"""
    diff2D_step!(T2, T, Ci, lam, dt, _dx, _dy)

Kernel body in which each thread updates one interior entry of `T2` from `T`.
The pair `(ix, iy)` is the thread's global 2D index, built from `blockIdx`, `blockDim`
and `threadIdx`.

For `1 < ix < size(T2,1)` and `1 < iy < size(T2,2)` the thread forms the differences
of `T` with its four neighbours, scales them by `-lam` and by `_dx` / `_dy`, and writes
`T[ix,iy]` plus `dt*Ci[ix,iy]` times the negated difference of those terms into
`T2[ix,iy]`. All other threads leave `T2` untouched. Returns `nothing`.

NOTE(review): `_dx` and `_dy` multiply the differences, so they are presumably inverse
grid spacings; confirm with the caller.
"""
function diff2D_step!(T2, T, Ci, lam, dt, _dx, _dy)
    iy = threadIdx().y + blockDim().y * (blockIdx().y-1)
    ix = threadIdx().x + blockDim().x * (blockIdx().x-1)
    # Only strictly interior points are updated; every other thread exits early.
    if ix <= 1 || ix >= size(T2,1) || iy <= 1 || iy >= size(T2,2)
        return
    end
    @inbounds begin
        Tc = T[ix,iy]
        # Scaled differences towards the upper and lower neighbour along each axis.
        qx_hi = -lam*(T[ix+1,iy] - Tc)*_dx
        qx_lo = -lam*(Tc - T[ix-1,iy])*_dx
        qy_hi = -lam*(T[ix,iy+1] - Tc)*_dy
        qy_lo = -lam*(Tc - T[ix,iy-1])*_dy
        T2[ix,iy] = Tc + dt*(Ci[ix,iy]*(-(qx_hi - qx_lo)*_dx - (qy_hi - qy_lo)*_dy))
    end
    return
end
"""
    run_bench(; fact=32)

Allocate three `nx`-by-`ny` GPU arrays of element type `DAT`, time the four kernels
`memcopy_triad!`, `memcopy_triad_pow_int!`, `memcopy_triad_pow_float!` and
`diff2D_step!` with `@belapsed`, and print one throughput figure in GB/s per kernel.

# Keywords
- `fact`: positive integer size factor. The arrays measure `sc*fact*1024` by
  `fact*1024`, so the default of 32 reproduces the original fixed problem size.
  Lower it on devices that cannot hold three arrays of that size.

# Throws
- `ArgumentError` if `fact` is not positive.

Returns `nothing`.
"""
function run_bench(; fact::Integer=32)
    fact > 0 || throw(ArgumentError("fact must be a positive integer, got $(fact)"))
    nx, ny    = sc*fact*1024, fact*1024
    threads   = (32, 8)
    # nx and ny are multiples of 1024, so both divisions are exact and the launch
    # grid covers the arrays exactly.
    blocks    = (nx÷threads[1], ny÷threads[2])
    A         = CUDA.zeros(DAT, nx, ny)
    B         = CUDA.rand(DAT, nx, ny)
    C         = CUDA.ones(DAT, nx, ny)
    pow_int   = DAT_Int(3)
    pow_float = DAT(3.75)
    s         = rand(DAT)
    lam       = rand(DAT)
    _dx, _dy  = DAT(1.0), DAT(1.0)
    dt        = DAT(1.0/10.0/4.1)
    println("nx, ny, DAT = $(nx), $(ny), $(DAT)")
    # Throughput in GB/s for one kernel call lasting `t` seconds. The factor 3 counts
    # three nx-by-ny arrays per call (presumably two read and one written).
    gbps(t) = round(3*1/1e9*nx*ny*sizeof(DAT)/t, sigdigits=7)
    # run test 1
    t_it = @belapsed begin @cuda blocks=$blocks threads=$threads memcopy_triad!($A, $B, $C, $s); synchronize() end
    println("T_tot triad2D = $(gbps(t_it)) GB/s")
    # run test 2
    t_it = @belapsed begin @cuda blocks=$blocks threads=$threads memcopy_triad_pow_int!($A, $B, $C, $s, $pow_int); synchronize() end
    println("T_tot triad2D pow_int = $(gbps(t_it)) GB/s")
    # run test 3
    t_it = @belapsed begin @cuda blocks=$blocks threads=$threads memcopy_triad_pow_float!($A, $B, $C, $s, $pow_float); synchronize() end
    println("T_tot triad2D pow_float = $(gbps(t_it)) GB/s")
    # run test 4
    t_it = @belapsed begin @cuda blocks=$blocks threads=$threads diff2D_step!($A, $B, $C, $lam, $dt, $_dx, $_dy); synchronize() end
    println("T_tot diffusion 2D = $(gbps(t_it)) GB/s")
    return
end
# Script entry point: run all four kernel benchmarks when this file is executed or included.
run_bench()