Skip to content

Commit

Permalink
[SYCL][Graph] Port sycl/test/graph tests to sycl/test-e2e/Graph
Browse files Browse the repository at this point in the history
We've treating these tests are runtime tests, so move them to test-e2e which the following changes:

* Prefer device USM to shared USM. Device USM support is mandatory while shared is optional

* Change `TestQueue` variable name to `Queue`

* Comment new tests and remove buffer copy back behaviour.

* Fix test name mismatch between recording & explicit.

* Rename `Explicit/whole_graph_update_ordering.cpp` -> `Explicit/executable_graph_update_ordering.cpp`

* Introduce a record & replay saxypy test.

* Add more host, shared, and system USM tests.

* Move dotp reductions to their own tests
  • Loading branch information
EwanC authored May 16, 2023
1 parent 316c3a3 commit cdfd8b6
Show file tree
Hide file tree
Showing 98 changed files with 3,100 additions and 2,171 deletions.
42 changes: 21 additions & 21 deletions sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include "../graph_common.hpp"

int main() {
queue TestQueue;
queue Queue;

using T = unsigned int;

Expand All @@ -30,18 +30,18 @@ int main() {
}
}

exp_ext::command_graph Graph{TestQueue.get_context(), TestQueue.get_device()};
exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

T *PtrA = malloc_device<T>(Size, TestQueue);
T *PtrB = malloc_device<T>(Size, TestQueue);
T *PtrC = malloc_device<T>(Size, TestQueue);
T *PtrOut = malloc_device<T>(Size, TestQueue);
T *PtrA = malloc_device<T>(Size, Queue);
T *PtrB = malloc_device<T>(Size, Queue);
T *PtrC = malloc_device<T>(Size, Queue);
T *PtrOut = malloc_device<T>(Size, Queue);

TestQueue.copy(DataA.data(), PtrA, Size);
TestQueue.copy(DataB.data(), PtrB, Size);
TestQueue.copy(DataC.data(), PtrC, Size);
TestQueue.copy(DataOut.data(), PtrOut, Size);
TestQueue.wait_and_throw();
Queue.copy(DataA.data(), PtrA, Size);
Queue.copy(DataB.data(), PtrB, Size);
Queue.copy(DataC.data(), PtrC, Size);
Queue.copy(DataOut.data(), PtrOut, Size);
Queue.wait_and_throw();

auto NodeA = Graph.add([&](handler &CGH) {
CGH.parallel_for(range<1>(Size),
Expand All @@ -61,29 +61,29 @@ int main() {

event Event;
for (unsigned n = 0; n < Iterations; n++) {
Event = TestQueue.submit([&](handler &CGH) {
Event = Queue.submit([&](handler &CGH) {
CGH.depends_on(Event);
CGH.ext_oneapi_graph(GraphExec);
});
}

for (unsigned n = 0; n < Iterations; n++) {
Event = TestQueue.submit([&](handler &CGH) {
Event = Queue.submit([&](handler &CGH) {
CGH.depends_on(Event);
CGH.ext_oneapi_graph(GraphExecAdditional);
});
}

TestQueue.wait_and_throw();
Queue.wait_and_throw();

TestQueue.copy(PtrC, DataC.data(), Size);
TestQueue.copy(PtrOut, DataOut.data(), Size);
TestQueue.wait_and_throw();
Queue.copy(PtrC, DataC.data(), Size);
Queue.copy(PtrOut, DataOut.data(), Size);
Queue.wait_and_throw();

free(PtrA, TestQueue);
free(PtrB, TestQueue);
free(PtrC, TestQueue);
free(PtrOut, TestQueue);
free(PtrA, Queue);
free(PtrB, Queue);
free(PtrC, Queue);
free(PtrOut, Queue);

assert(ReferenceC == DataC);
assert(ReferenceOut == DataOut);
Expand Down
9 changes: 4 additions & 5 deletions sycl/test-e2e/Graph/Explicit/basic_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "../graph_common.hpp"

int main() {
queue TestQueue;
queue Queue;

using T = unsigned short;

Expand All @@ -23,8 +23,7 @@ int main() {
ReferenceC);

{
exp_ext::command_graph Graph{TestQueue.get_context(),
TestQueue.get_device()};
exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
buffer<T> BufferA{DataA.data(), range<1>{DataA.size()}};
BufferA.set_write_back(false);
buffer<T> BufferB{DataB.data(), range<1>{DataB.size()}};
Expand All @@ -39,12 +38,12 @@ int main() {

event Event;
for (unsigned n = 0; n < Iterations; n++) {
Event = TestQueue.submit([&](handler &CGH) {
Event = Queue.submit([&](handler &CGH) {
CGH.depends_on(Event);
CGH.ext_oneapi_graph(GraphExec);
});
}
TestQueue.wait_and_throw();
Queue.wait_and_throw();

host_accessor HostAccA(BufferA);
host_accessor HostAccB(BufferB);
Expand Down
36 changes: 18 additions & 18 deletions sycl/test-e2e/Graph/Explicit/basic_usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "../graph_common.hpp"

int main() {
queue TestQueue;
queue Queue;

using T = int;

Expand All @@ -22,16 +22,16 @@ int main() {
calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
ReferenceC);

exp_ext::command_graph Graph{TestQueue.get_context(), TestQueue.get_device()};
exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

T *PtrA = malloc_device<T>(Size, TestQueue);
T *PtrB = malloc_device<T>(Size, TestQueue);
T *PtrC = malloc_device<T>(Size, TestQueue);
T *PtrA = malloc_device<T>(Size, Queue);
T *PtrB = malloc_device<T>(Size, Queue);
T *PtrC = malloc_device<T>(Size, Queue);

TestQueue.copy(DataA.data(), PtrA, Size);
TestQueue.copy(DataB.data(), PtrB, Size);
TestQueue.copy(DataC.data(), PtrC, Size);
TestQueue.wait_and_throw();
Queue.copy(DataA.data(), PtrA, Size);
Queue.copy(DataB.data(), PtrB, Size);
Queue.copy(DataC.data(), PtrC, Size);
Queue.wait_and_throw();

// Add commands to graph
add_kernels_usm(Graph, Size, PtrA, PtrB, PtrC);
Expand All @@ -40,22 +40,22 @@ int main() {

event Event;
for (unsigned n = 0; n < Iterations; n++) {
Event = TestQueue.submit([&](handler &CGH) {
Event = Queue.submit([&](handler &CGH) {
CGH.depends_on(Event);
CGH.ext_oneapi_graph(GraphExec);
});
}

TestQueue.wait_and_throw();
Queue.wait_and_throw();

TestQueue.copy(PtrA, DataA.data(), Size);
TestQueue.copy(PtrB, DataB.data(), Size);
TestQueue.copy(PtrC, DataC.data(), Size);
TestQueue.wait_and_throw();
Queue.copy(PtrA, DataA.data(), Size);
Queue.copy(PtrB, DataB.data(), Size);
Queue.copy(PtrC, DataC.data(), Size);
Queue.wait_and_throw();

free(PtrA, TestQueue);
free(PtrB, TestQueue);
free(PtrC, TestQueue);
free(PtrA, Queue);
free(PtrB, Queue);
free(PtrC, Queue);

assert(ReferenceA == DataA);
assert(ReferenceB == DataB);
Expand Down
75 changes: 75 additions & 0 deletions sycl/test-e2e/Graph/Explicit/dotp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// REQUIRES: level_zero, gpu
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// Tests constructing a graph using the explicit API to perform a dotp
// operation using USM memory.

#include "../graph_common.hpp"

int main() {
queue Queue{gpu_selector_v};

exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

float *Dotp = malloc_device<float>(1, Queue);

const size_t N = 10;
float *X = malloc_device<float>(N, Queue);
float *Y = malloc_device<float>(N, Queue);
float *Z = malloc_device<float>(N, Queue);

auto NodeI = Graph.add([&](handler &CGH) {
CGH.parallel_for(N, [=](id<1> it) {
const size_t i = it[0];
X[i] = 1.0f;
Y[i] = 2.0f;
Z[i] = 3.0f;
});
});

auto NodeA = Graph.add(
[&](handler &CGH) {
CGH.parallel_for(range<1>{N}, [=](id<1> it) {
const size_t i = it[0];
X[i] = Alpha * X[i] + Beta * Y[i];
});
},
{exp_ext::property::node::depends_on(NodeI)});

auto NodeB = Graph.add(
[&](handler &CGH) {
CGH.parallel_for(range<1>{N}, [=](id<1> it) {
const size_t i = it[0];
Z[i] = Gamma * Z[i] + Beta * Y[i];
});
},
{exp_ext::property::node::depends_on(NodeI)});

auto NodeC = Graph.add(
[&](handler &CGH) {
CGH.single_task([=]() {
for (size_t j = 0; j < N; j++) {
Dotp[0] += X[j] * Z[j];
}
});
},
{exp_ext::property::node::depends_on(NodeA, NodeB)});

auto ExecGraph = Graph.finalize();

// Using shortcut for executing a graph of commands
Queue.ext_oneapi_graph(ExecGraph).wait();

float Output;
Queue.memcpy(&Output, Dotp, sizeof(float)).wait();

assert(Output == dotp_reference_result(N));

sycl::free(Dotp, Queue);
sycl::free(X, Queue);
sycl::free(Y, Queue);
sycl::free(Z, Queue);

return 0;
}
85 changes: 85 additions & 0 deletions sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// REQUIRES: level_zero, gpu
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// Tests creating a dotp operation through explicit graph creation with
// buffers.

#include "../graph_common.hpp"

int main() {

queue Queue{gpu_selector_v};

exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

float DotpData = 0.f;

const size_t N = 10;
std::vector<float> XData(N);
std::vector<float> YData(N);
std::vector<float> ZData(N);

{
buffer DotpBuf(&DotpData, range<1>(1));
DotpBuf.set_write_back(false);

buffer XBuf(XData);
XBuf.set_write_back(false);
buffer YBuf(YData);
YBuf.set_write_back(false);
buffer ZBuf(ZData);
ZBuf.set_write_back(false);

auto NodeI = Graph.add([&](handler &CGH) {
auto X = XBuf.get_access(CGH);
auto Y = YBuf.get_access(CGH);
auto Z = ZBuf.get_access(CGH);
CGH.parallel_for(N, [=](id<1> it) {
const size_t i = it[0];
X[i] = 1.0f;
Y[i] = 2.0f;
Z[i] = 3.0f;
});
});

auto NodeA = Graph.add([&](handler &CGH) {
auto X = XBuf.get_access(CGH);
auto Y = YBuf.get_access(CGH);
CGH.parallel_for(range<1>{N}, [=](id<1> it) {
const size_t i = it[0];
X[i] = Alpha * X[i] + Beta * Y[i];
});
});

auto NodeB = Graph.add([&](handler &CGH) {
auto Y = YBuf.get_access(CGH);
auto Z = ZBuf.get_access(CGH);
CGH.parallel_for(range<1>{N}, [=](id<1> it) {
const size_t i = it[0];
Z[i] = Gamma * Z[i] + Beta * Y[i];
});
});

auto NodeC = Graph.add([&](handler &CGH) {
auto Dotp = DotpBuf.get_access(CGH);
auto X = XBuf.get_access(CGH);
auto Z = ZBuf.get_access(CGH);
CGH.single_task([=]() {
for (size_t j = 0; j < N; j++) {
Dotp[0] += X[j] * Z[j];
}
});
});

auto ExecGraph = Graph.finalize();

// Using shortcut for executing a graph of commands
Queue.ext_oneapi_graph(ExecGraph).wait();

host_accessor HostAcc(DotpBuf);
assert(HostAcc[0] == dotp_reference_result(N));
}

return 0;
}
Loading

0 comments on commit cdfd8b6

Please sign in to comment.