-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenkernel.nim
36 lines (29 loc) · 993 Bytes
/
genkernel.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import cuda
type
  gpuArray[T] = distinct ptr array[0, T]
    ## Typed handle to a buffer of `T` elements. It is a distinct pointer to
    ## a zero-length array, so it is not interchangeable with an ordinary
    ## `ptr` and only gets the operations defined for it explicitly.
    ## In this file the pointer value is obtained from `cudaMalloc`.
template `[]`(x: gpuArray, i: SomeInteger): untyped =
  ## Element access for `gpuArray`: converts `x` back to its underlying
  ## `ptr array[0, T]` and indexes it at `i`.
  ##
  ## The expansion is a plain indexing expression, so the result can be read
  ## or updated in place (for example with `*=`).
  (ptr array[0, x.T])(x)[i]
template `[]=`(x: gpuArray, i: SomeInteger, y: untyped): untyped =
  ## Element assignment for `gpuArray`: converts `x` back to its underlying
  ## `ptr array[0, T]` and stores `y` at index `i`.
  (ptr array[0, x.T])(x)[i] = y
proc alloc(a: var gpuArray, n: int) =
  ## Allocates room for `n` elements of `a`'s element type with `cudaMalloc`
  ## and stores the resulting pointer in `a`.
  ##
  ## Nothing is printed when the call succeeds. If `cudaMalloc` reports an
  ## error, the error is printed and the program exits with code -1, which is
  ## the same failure handling `newGpuArray` uses.
  # `a` is a distinct pointer, so its address is handed to cudaMalloc as the
  # `ptr pointer` out-parameter (the call form `newGpuArray` relies on).
  let err = cudaMalloc(cast[ptr pointer](a.addr), n*sizeof(a.T))
  if err:
    echo "alloc err: ", err
    quit(-1)
proc newGpuArray[T](n: int): gpuArray[T] =
  ## Requests space for `n` values of type `T` from `cudaMalloc` and returns
  ## the pointer wrapped as a `gpuArray[T]`.
  ##
  ## If `cudaMalloc` reports an error, the error is printed and the program
  ## terminates with exit code -1.
  var raw: pointer
  let status = cudaMalloc(addr(raw), n * sizeof(T))
  if status:
    echo "alloc err: ", status
    quit(-1)
  # Reinterpret the untyped pointer as the array pointer behind `gpuArray`.
  result = gpuArray[T](cast[ptr array[0, T]](raw))
proc timesTwo[T](a: gpuArray[T]; n: int32) {.cudaGlobal.} =
  ## Kernel (marked `cudaGlobal`): each thread computes its own index from
  ## `blockDim`, `blockIdx` and `threadIdx` and doubles `a` at that index,
  ## provided the index is below `n`.
  let idx = blockDim.x * blockIdx.x + threadIdx.x
  # Threads whose index is at or beyond `n` leave the array untouched.
  if idx < n:
    a[idx] *= T(2)
# Driver: allocate one float32 and one float64 buffer of `n` elements and
# launch `timesTwo` on each of them.
var
  n = 10000'i32
  a = newGpuArray[float32](n)
  b = newGpuArray[float64](n)
  threadsPerBlock = 256'i32
  # Ceiling division, so the blocks together cover all `n` indices.
  blocksPerGrid = (n + threadsPerBlock - 1) div threadsPerBlock
# Launch syntax is provided by the `cuda` module; spacing is kept as-is
# because operator spacing affects how Nim parses the expression.
timesTwo<<(blocksPerGrid,threadsPerBlock)>>(a,n)
timesTwo<<(blocksPerGrid,threadsPerBlock)>>(b,n)