Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Add empty node implementation #106

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sycl/include/sycl/ext/oneapi/experimental/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class __SYCL_EXPORT command_graph {
command_graph(const property_list &propList = {});

// Adding empty node with [0..n] predecessors:
node add(const std::vector<node> &dep = {});
node add(const std::vector<node> &dep = {}) { return add_impl(dep); }

// Adding device node:
template <typename T> node add(T cgf, const std::vector<node> &dep = {}) {
Expand Down
6 changes: 6 additions & 0 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
class handler;
template <typename T, int Dimensions, typename AllocatorT, typename Enable>
class buffer;

namespace ext::oneapi::experimental::detail {
class graph_impl;
}
namespace detail {

class handler_impl;
Expand Down Expand Up @@ -2611,6 +2615,8 @@ class __SYCL_EXPORT handler {
friend class ::MockHandler;
friend class detail::queue_impl;

friend class ext::oneapi::experimental::detail::graph_impl;

bool DisableRangeRounding();

bool RangeRoundingTrace();
Expand Down
140 changes: 104 additions & 36 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <detail/graph_impl.hpp>
#include <detail/queue_impl.hpp>
#include <detail/scheduler/commands.hpp>
#include <sycl/queue.hpp>

namespace sycl {
Expand Down Expand Up @@ -61,76 +62,117 @@ void graph_impl::remove_root(const std::shared_ptr<node_impl> &Root) {
// Recursive check if a graph node or its successors contains a given kernel
// argument.
//
// @param[in] arg The kernel argument to check for.
// @param[in] currentNode The current graph node being checked.
// @param[in,out] deps The unique list of dependencies which have been
// @param[in] Arg The kernel argument to check for.
// @param[in] CurrentNode The current graph node being checked.
// @param[in,out] Deps The unique list of dependencies which have been
// identified for this arg.
// @param[in] dereferencePtr if true arg comes direct from the handler in which
// case it will need to be deferenced to check actual value.
//
// @returns True if a dependency was added in this node of any of its
// successors.
bool check_for_arg(const sycl::detail::ArgDesc &Arg,
const std::shared_ptr<node_impl> &CurrentNode,
std::set<std::shared_ptr<node_impl>> &Deps,
bool DereferencePtr = false) {
std::set<std::shared_ptr<node_impl>> &Deps) {
bool SuccessorAddedDep = false;
for (auto &Successor : CurrentNode->MSuccessors) {
SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps, DereferencePtr);
SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps);
}

if (Deps.find(CurrentNode) == Deps.end() &&
CurrentNode->has_arg(Arg, DereferencePtr) && !SuccessorAddedDep) {
if (Deps.find(CurrentNode) == Deps.end() && CurrentNode->has_arg(Arg) &&
!SuccessorAddedDep) {
Deps.insert(CurrentNode);
return true;
}
return SuccessorAddedDep;
}

template <typename T>
std::shared_ptr<node_impl>
graph_impl::add(const std::shared_ptr<graph_impl> &impl, T cgf,
const std::vector<sycl::detail::ArgDesc> &args,
const std::vector<std::shared_ptr<node_impl>> &dep) {
std::shared_ptr<node_impl> nodeImpl =
std::make_shared<node_impl>(impl, cgf, args);
graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
const std::vector<std::shared_ptr<node_impl>> &Dep) {
const std::shared_ptr<node_impl> &NodeImpl =
std::make_shared<node_impl>(Impl);

// TODO: Encapsulate in separate function to avoid duplication
if (!Dep.empty()) {
for (auto N : Dep) {
N->register_successor(NodeImpl); // register successor
this->remove_root(NodeImpl); // remove receiver from root node
// list
}
} else {
this->add_root(NodeImpl);
}

return NodeImpl;
}

std::shared_ptr<node_impl>
graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
std::function<void(handler &)> CGF,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep) {
sycl::queue TempQueue{};
auto QueueImpl = sycl::detail::getSyclObjImpl(TempQueue);
QueueImpl->setCommandGraph(Impl);
sycl::handler Handler{QueueImpl, false};
CGF(Handler);

return this->add(Impl, Handler.MKernel, Handler.MNDRDesc,
Handler.MOSModuleHandle, Handler.MKernelName,
Handler.MAccStorage, Handler.MLocalAccStorage,
Handler.MRequirements, Handler.MArgs, {});
}

std::shared_ptr<node_impl> graph_impl::add(
const std::shared_ptr<graph_impl> &Impl,
std::shared_ptr<sycl::detail::kernel_impl> Kernel,
sycl::detail::NDRDescT NDRDesc, sycl::detail::OSModuleHandle OSModuleHandle,
std::string KernelName,
const std::vector<sycl::detail::AccessorImplPtr> &AccStorage,
const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep) {
const std::shared_ptr<node_impl> &NodeImpl = std::make_shared<node_impl>(
Impl, Kernel, NDRDesc, OSModuleHandle, KernelName, AccStorage,
LocalAccStorage, Requirements, Args);
// Copy deps so we can modify them
auto deps = dep;
auto Deps = Dep;
// A unique set of dependencies obtained by checking kernel arguments
std::set<std::shared_ptr<node_impl>> uniqueDeps;
for (auto &arg : args) {
if (arg.MType != sycl::detail::kernel_param_kind_t::kind_pointer) {
// for accessors
std::set<std::shared_ptr<node_impl>> UniqueDeps;
for (auto &Arg : Args) {
if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_accessor) {
continue;
}
// Look through the graph for nodes which share this argument
for (auto nodePtr : MRoots) {
check_for_arg(arg, nodePtr, uniqueDeps, true);
for (auto NodePtr : MRoots) {
check_for_arg(Arg, NodePtr, UniqueDeps);
}
}

// Add any deps determined from arguments into the dependency list
deps.insert(deps.end(), uniqueDeps.begin(), uniqueDeps.end());
if (!deps.empty()) {
for (auto n : deps) {
n->register_successor(nodeImpl); // register successor
this->remove_root(nodeImpl); // remove receiver from root node
// Add any deps determined from accessor arguments into the dependency list
Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end());
if (!Deps.empty()) {
for (auto N : Deps) {
N->register_successor(NodeImpl); // register successor
this->remove_root(NodeImpl); // remove receiver from root node
// list
}
} else {
this->add_root(nodeImpl);
this->add_root(NodeImpl);
}
return nodeImpl;
return NodeImpl;
}

bool graph_impl::clear_queues() {
bool anyQueuesCleared = false;
for (auto &q : MRecordingQueues) {
q->setCommandGraph(nullptr);
anyQueuesCleared = true;
bool AnyQueuesCleared = false;
for (auto &Queue : MRecordingQueues) {
Queue->setCommandGraph(nullptr);
AnyQueuesCleared = true;
}
MRecordingQueues.clear();

return anyQueuesCleared;
return AnyQueuesCleared;
}

void node_impl::exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
Expand All @@ -139,7 +181,33 @@ void node_impl::exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
for (auto Sender : MPredecessors)
Deps.push_back(Sender->get_event());

MEvent = Queue->submit(wrapper{MBody, Deps}, Queue _CODELOCFW(CodeLoc));
// Enqueue kernel here instead of submit

std::vector<pi_event> RawEvents;
pi_event *OutEvent = nullptr;
auto NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
NewEvent->setContextImpl(Queue->getContextImplPtr());
NewEvent->setStateIncomplete();
OutEvent = &NewEvent->getHandleRef();
pi_result Res =
Queue->getPlugin().call_nocheck<sycl::detail::PiApiKind::piEventCreate>(
sycl::detail::getSyclObjImpl(Queue->get_context())->getHandleRef(),
OutEvent);
if (Res != pi_result::PI_SUCCESS) {
throw sycl::exception(errc::event,
"Failed to create event for node submission");
}

pi_int32 Result = enqueueImpKernel(
Queue, MNDRDesc, MArgs, /* KernelBundleImpPtr */ nullptr, MKernel,
MKernelName, MOSModuleHandle, RawEvents, OutEvent, nullptr);
if (Result != pi_result::PI_SUCCESS) {
throw sycl::exception(errc::kernel, "Error enqueuing graph node kernel");
}
sycl::event QueueEvent =
sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
Queue->addEvent(QueueEvent);
MEvent = QueueEvent;
}
} // namespace detail

Expand Down
92 changes: 72 additions & 20 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@
#pragma once

#include <sycl/detail/cg_types.hpp>
#include <sycl/detail/os_util.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>
#include <sycl/handler.hpp>

#include <detail/kernel_impl.hpp>

#include <cstring>
#include <functional>
#include <list>
#include <set>
Expand Down Expand Up @@ -48,9 +52,24 @@ struct node_impl {
std::vector<std::shared_ptr<node_impl>> MSuccessors;
std::vector<std::shared_ptr<node_impl>> MPredecessors;

std::function<void(sycl::handler &)> MBody;

/// Kernel to be executed by this node
std::shared_ptr<sycl::detail::kernel_impl> MKernel;
/// Description of the kernel global and local sizes as well as offset
sycl::detail::NDRDescT MNDRDesc;
/// Module handle for the kernel to be executed.
sycl::detail::OSModuleHandle MOSModuleHandle =
sycl::detail::OSUtil::ExeModuleHandle;
/// Kernel name inside the module
std::string MKernelName;
std::vector<sycl::detail::AccessorImplPtr> MAccStorage;
std::vector<sycl::detail::LocalAccessorImplPtr> MLocalAccStorage;
std::vector<sycl::detail::AccessorImplHost *> MRequirements;

/// Store arg descriptors for the kernel arguments
std::vector<sycl::detail::ArgDesc> MArgs;
// We need to store local copies of the values pointed to by MArgssince they
// may go out of scope before execution.
std::vector<std::vector<std::byte>> MArgStorage;

void exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
_CODELOCPARAM(&CodeLoc));
Expand All @@ -66,17 +85,33 @@ struct node_impl {

sycl::event get_event(void) const { return MEvent; }

template <typename T>
node_impl(const std::shared_ptr<graph_impl> &Graph, T CGF,
const std::vector<sycl::detail::ArgDesc> &Args)
: MScheduled(false), MGraph(Graph), MBody(CGF), MArgs(Args) {
node_impl(const std::shared_ptr<graph_impl> &Graph)
: MScheduled(false), MGraph(Graph) {}

node_impl(
const std::shared_ptr<graph_impl> &Graph,
std::shared_ptr<sycl::detail::kernel_impl> Kernel,
sycl::detail::NDRDescT NDRDesc,
sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName,
const std::vector<sycl::detail::AccessorImplPtr> &AccStorage,
const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
const std::vector<sycl::detail::ArgDesc> &args)
: MScheduled(false), MGraph(Graph), MKernel(Kernel), MNDRDesc(NDRDesc),
MOSModuleHandle(OSModuleHandle), MKernelName(KernelName),
MAccStorage(AccStorage), MLocalAccStorage(LocalAccStorage),
MRequirements(Requirements), MArgs(args), MArgStorage() {

// Need to copy the arg values to node local storage so that they don't go
// out of scope before execution
for (size_t i = 0; i < MArgs.size(); i++) {
if (MArgs[i].MType == sycl::detail::kernel_param_kind_t::kind_pointer) {
// Make sure we are storing the actual USM pointer for comparison
// purposes, note we couldn't actually submit using these copies of the
// args if subsequent code expects a void**.
MArgs[i].MPtr = *(void **)(MArgs[i].MPtr);
}
auto &CurrentArg = MArgs[i];
MArgStorage.emplace_back(CurrentArg.MSize);
auto StoragePtr = MArgStorage.back().data();
if (CurrentArg.MPtr)
std::memcpy(StoragePtr, CurrentArg.MPtr, CurrentArg.MSize);
// Set the arg descriptor to point to the new storage
CurrentArg.MPtr = StoragePtr;
}
}

Expand All @@ -87,16 +122,18 @@ struct node_impl {
if (!Next->MScheduled)
Next->topology_sort(Schedule);
}
Schedule.push_front(std::shared_ptr<node_impl>(this));
if (MKernel != nullptr)
Schedule.push_front(std::shared_ptr<node_impl>(this));
}

bool has_arg(const sycl::detail::ArgDesc &Arg, bool DereferencePtr = false) {
bool has_arg(const sycl::detail::ArgDesc &Arg) {
for (auto &NodeArg : MArgs) {
if (Arg.MType == NodeArg.MType && Arg.MSize == NodeArg.MSize) {
// Args coming directly from the handler will need to be dereferenced
// since they are actually void**
void *IncomingPtr = DereferencePtr ? *(void **)Arg.MPtr : Arg.MPtr;
if (IncomingPtr == NodeArg.MPtr) {
// Args are actually void** so we need to dereference them to compare
// actual values
void *IncomingPtr = *static_cast<void **>(Arg.MPtr);
void *ArgPtr = *static_cast<void **>(NodeArg.MPtr);
if (IncomingPtr == ArgPtr) {
return true;
}
}
Expand All @@ -119,12 +156,27 @@ struct graph_impl {
void add_root(const std::shared_ptr<node_impl> &);
void remove_root(const std::shared_ptr<node_impl> &);

template <typename T>
std::shared_ptr<node_impl>
add(const std::shared_ptr<graph_impl> &Impl, T CGF,
add(const std::shared_ptr<graph_impl> &Impl,
std::shared_ptr<sycl::detail::kernel_impl> Kernel,
sycl::detail::NDRDescT NDRDesc,
sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName,
const std::vector<sycl::detail::AccessorImplPtr> &AccStorage,
const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep = {});

std::shared_ptr<node_impl>
add(const std::shared_ptr<graph_impl> &Impl,
std::function<void(handler &)> CGF,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep = {});

std::shared_ptr<node_impl>
add(const std::shared_ptr<graph_impl> &Impl,
const std::vector<std::shared_ptr<node_impl>> &Dep = {});

graph_impl() : MFirst(true) {}

/// Add a queue to the set of queues which are currently recording to this
Expand Down
Loading