Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Remove CGF reuse in graph nodes #87

Merged
merged 9 commits into from
Mar 23, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
class handler;
template <typename T, int Dimensions, typename AllocatorT, typename Enable>
class buffer;

namespace ext::oneapi::experimental::detail {
class graph_impl;
}
namespace detail {

class handler_impl;
Expand Down Expand Up @@ -372,6 +376,14 @@ class __SYCL_EXPORT handler {
std::shared_ptr<detail::queue_impl> PrimaryQueue,
std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost);

/// Constructs SYCL handler from Graph.
///
/// The hander will add the command-group as a node to the graph rather than
/// enqueueing it straight away.
///
/// \param Graph is a SYCL command_graph
handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);

/// Stores copy of Arg passed to the MArgsStorage.
template <typename T, typename F = typename detail::remove_const_t<
typename detail::remove_reference_t<T>>>
Expand Down Expand Up @@ -2528,6 +2540,8 @@ class __SYCL_EXPORT handler {
private:
std::shared_ptr<detail::handler_impl> MImpl;
std::shared_ptr<detail::queue_impl> MQueue;
std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph;

/// The storage for the arguments passed.
/// We need to store a copy of values that are passed explicitly through
/// set_arg, require and so on, because we need them to be alive after
Expand Down Expand Up @@ -2611,6 +2625,8 @@ class __SYCL_EXPORT handler {
friend class ::MockHandler;
friend class detail::queue_impl;

friend class ext::oneapi::experimental::detail::graph_impl;

bool DisableRangeRounding();

bool RangeRoundingTrace();
Expand Down
117 changes: 81 additions & 36 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <detail/graph_impl.hpp>
#include <detail/queue_impl.hpp>
#include <detail/scheduler/commands.hpp>
#include <sycl/queue.hpp>

namespace sycl {
Expand Down Expand Up @@ -61,76 +62,94 @@ void graph_impl::remove_root(const std::shared_ptr<node_impl> &Root) {
// Recursive check if a graph node or its successors contains a given kernel
// argument.
//
// @param[in] arg The kernel argument to check for.
// @param[in] currentNode The current graph node being checked.
// @param[in,out] deps The unique list of dependencies which have been
// @param[in] Arg The kernel argument to check for.
// @param[in] CurrentNode The current graph node being checked.
// @param[in,out] Deps The unique list of dependencies which have been
// identified for this arg.
// @param[in] dereferencePtr if true arg comes direct from the handler in which
// case it will need to be deferenced to check actual value.
//
// @returns True if a dependency was added in this node of any of its
// successors.
bool check_for_arg(const sycl::detail::ArgDesc &Arg,
const std::shared_ptr<node_impl> &CurrentNode,
std::set<std::shared_ptr<node_impl>> &Deps,
bool DereferencePtr = false) {
std::set<std::shared_ptr<node_impl>> &Deps) {
bool SuccessorAddedDep = false;
for (auto &Successor : CurrentNode->MSuccessors) {
SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps, DereferencePtr);
SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps);
}

if (Deps.find(CurrentNode) == Deps.end() &&
CurrentNode->has_arg(Arg, DereferencePtr) && !SuccessorAddedDep) {
if (Deps.find(CurrentNode) == Deps.end() && CurrentNode->has_arg(Arg) &&
!SuccessorAddedDep) {
Deps.insert(CurrentNode);
return true;
}
return SuccessorAddedDep;
}

template <typename T>
std::shared_ptr<node_impl>
graph_impl::add(const std::shared_ptr<graph_impl> &impl, T cgf,
const std::vector<sycl::detail::ArgDesc> &args,
const std::vector<std::shared_ptr<node_impl>> &dep) {
std::shared_ptr<node_impl> nodeImpl =
std::make_shared<node_impl>(impl, cgf, args);
graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
std::function<void(handler &)> CGF,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep) {
sycl::handler Handler{Impl};
CGF(Handler);

return this->add(Impl, Handler.MKernel, Handler.MNDRDesc,
Handler.MOSModuleHandle, Handler.MKernelName,
Handler.MAccStorage, Handler.MLocalAccStorage,
Handler.MRequirements, Handler.MArgs, {});
}

std::shared_ptr<node_impl> graph_impl::add(
const std::shared_ptr<graph_impl> &Impl,
std::shared_ptr<sycl::detail::kernel_impl> Kernel,
sycl::detail::NDRDescT NDRDesc, sycl::detail::OSModuleHandle OSModuleHandle,
std::string KernelName,
const std::vector<sycl::detail::AccessorImplPtr> &AccStorage,
const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep) {
const std::shared_ptr<node_impl> &NodeImpl = std::make_shared<node_impl>(
Impl, Kernel, NDRDesc, OSModuleHandle, KernelName, AccStorage,
LocalAccStorage, Requirements, Args);
// Copy deps so we can modify them
auto deps = dep;
auto Deps = Dep;
// A unique set of dependencies obtained by checking kernel arguments
std::set<std::shared_ptr<node_impl>> uniqueDeps;
for (auto &arg : args) {
if (arg.MType != sycl::detail::kernel_param_kind_t::kind_pointer) {
// for accessors
std::set<std::shared_ptr<node_impl>> UniqueDeps;
for (auto &Arg : Args) {
if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_accessor) {
continue;
}
// Look through the graph for nodes which share this argument
for (auto nodePtr : MRoots) {
check_for_arg(arg, nodePtr, uniqueDeps, true);
for (auto NodePtr : MRoots) {
check_for_arg(Arg, NodePtr, UniqueDeps);
}
}

// Add any deps determined from arguments into the dependency list
deps.insert(deps.end(), uniqueDeps.begin(), uniqueDeps.end());
if (!deps.empty()) {
for (auto n : deps) {
n->register_successor(nodeImpl); // register successor
this->remove_root(nodeImpl); // remove receiver from root node
// Add any deps determined from accessor arguments into the dependency list
Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end());
if (!Deps.empty()) {
for (auto N : Deps) {
N->register_successor(NodeImpl); // register successor
this->remove_root(NodeImpl); // remove receiver from root node
// list
}
} else {
this->add_root(nodeImpl);
this->add_root(NodeImpl);
}
return nodeImpl;
return NodeImpl;
}

bool graph_impl::clear_queues() {
bool anyQueuesCleared = false;
for (auto &q : MRecordingQueues) {
q->setCommandGraph(nullptr);
anyQueuesCleared = true;
bool AnyQueuesCleared = false;
for (auto &Queue : MRecordingQueues) {
Queue->setCommandGraph(nullptr);
AnyQueuesCleared = true;
}
MRecordingQueues.clear();

return anyQueuesCleared;
return AnyQueuesCleared;
}

void node_impl::exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
Expand All @@ -139,7 +158,33 @@ void node_impl::exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
for (auto Sender : MPredecessors)
Deps.push_back(Sender->get_event());

MEvent = Queue->submit(wrapper{MBody, Deps}, Queue _CODELOCFW(CodeLoc));
// Enqueue kernel here instead of submit

std::vector<pi_event> RawEvents;
pi_event *OutEvent = nullptr;
auto NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
NewEvent->setContextImpl(Queue->getContextImplPtr());
NewEvent->setStateIncomplete();
OutEvent = &NewEvent->getHandleRef();
pi_result Res =
Queue->getPlugin().call_nocheck<sycl::detail::PiApiKind::piEventCreate>(
sycl::detail::getSyclObjImpl(Queue->get_context())->getHandleRef(),
OutEvent);
if (Res != pi_result::PI_SUCCESS) {
throw sycl::exception(errc::event,
"Failed to create event for node submission");
}

pi_int32 Result = enqueueImpKernel(
Queue, MNDRDesc, MArgs, /* KernelBundleImpPtr */ nullptr, MKernel,
MKernelName, MOSModuleHandle, RawEvents, OutEvent, nullptr);
if (Result != pi_result::PI_SUCCESS) {
throw sycl::exception(errc::kernel, "Error enqueuing graph node kernel");
}
sycl::event QueueEvent =
sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
Queue->addEvent(QueueEvent);
MEvent = QueueEvent;
}
} // namespace detail

Expand Down
82 changes: 63 additions & 19 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@
#pragma once

#include <sycl/detail/cg_types.hpp>
#include <sycl/detail/os_util.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>
#include <sycl/handler.hpp>

#include <detail/kernel_impl.hpp>

#include <cstring>
#include <functional>
#include <list>
#include <set>
Expand Down Expand Up @@ -48,9 +52,24 @@ struct node_impl {
std::vector<std::shared_ptr<node_impl>> MSuccessors;
std::vector<std::shared_ptr<node_impl>> MPredecessors;

std::function<void(sycl::handler &)> MBody;

/// Kernel to be executed by this node
std::shared_ptr<sycl::detail::kernel_impl> MKernel;
/// Description of the kernel global and local sizes as well as offset
sycl::detail::NDRDescT MNDRDesc;
/// Module handle for the kernel to be executed.
sycl::detail::OSModuleHandle MOSModuleHandle =
sycl::detail::OSUtil::ExeModuleHandle;
/// Kernel name inside the module
std::string MKernelName;
std::vector<sycl::detail::AccessorImplPtr> MAccStorage;
std::vector<sycl::detail::LocalAccessorImplPtr> MLocalAccStorage;
std::vector<sycl::detail::AccessorImplHost *> MRequirements;

/// Store arg descriptors for the kernel arguments
std::vector<sycl::detail::ArgDesc> MArgs;
// We need to store local copies of the values pointed to by MArgssince they
// may go out of scope before execution.
std::vector<std::vector<std::byte>> MArgStorage;

void exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
_CODELOCPARAM(&CodeLoc));
Expand All @@ -66,17 +85,30 @@ struct node_impl {

sycl::event get_event(void) const { return MEvent; }

template <typename T>
node_impl(const std::shared_ptr<graph_impl> &Graph, T CGF,
const std::vector<sycl::detail::ArgDesc> &Args)
: MScheduled(false), MGraph(Graph), MBody(CGF), MArgs(Args) {
node_impl(
const std::shared_ptr<graph_impl> &Graph,
std::shared_ptr<sycl::detail::kernel_impl> Kernel,
sycl::detail::NDRDescT NDRDesc,
sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName,
const std::vector<sycl::detail::AccessorImplPtr> &AccStorage,
const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
const std::vector<sycl::detail::ArgDesc> &args)
: MScheduled(false), MGraph(Graph), MKernel(Kernel), MNDRDesc(NDRDesc),
MOSModuleHandle(OSModuleHandle), MKernelName(KernelName),
MAccStorage(AccStorage), MLocalAccStorage(LocalAccStorage),
MRequirements(Requirements), MArgs(args), MArgStorage() {

// Need to copy the arg values to node local storage so that they don't go
// out of scope before execution
for (size_t i = 0; i < MArgs.size(); i++) {
if (MArgs[i].MType == sycl::detail::kernel_param_kind_t::kind_pointer) {
// Make sure we are storing the actual USM pointer for comparison
// purposes, note we couldn't actually submit using these copies of the
// args if subsequent code expects a void**.
MArgs[i].MPtr = *(void **)(MArgs[i].MPtr);
}
auto &CurrentArg = MArgs[i];
MArgStorage.emplace_back(CurrentArg.MSize);
auto StoragePtr = MArgStorage.back().data();
if (CurrentArg.MPtr)
std::memcpy(StoragePtr, CurrentArg.MPtr, CurrentArg.MSize);
// Set the arg descriptor to point to the new storage
CurrentArg.MPtr = StoragePtr;
}
}

Expand All @@ -90,13 +122,14 @@ struct node_impl {
Schedule.push_front(std::shared_ptr<node_impl>(this));
}

bool has_arg(const sycl::detail::ArgDesc &Arg, bool DereferencePtr = false) {
bool has_arg(const sycl::detail::ArgDesc &Arg) {
for (auto &NodeArg : MArgs) {
if (Arg.MType == NodeArg.MType && Arg.MSize == NodeArg.MSize) {
// Args coming directly from the handler will need to be dereferenced
// since they are actually void**
void *IncomingPtr = DereferencePtr ? *(void **)Arg.MPtr : Arg.MPtr;
if (IncomingPtr == NodeArg.MPtr) {
// Args are actually void** so we need to dereference them to compare
// actual values
void *IncomingPtr = *static_cast<void **>(Arg.MPtr);
void *ArgPtr = *static_cast<void **>(NodeArg.MPtr);
if (IncomingPtr == ArgPtr) {
return true;
}
}
Expand All @@ -119,9 +152,20 @@ struct graph_impl {
void add_root(const std::shared_ptr<node_impl> &);
void remove_root(const std::shared_ptr<node_impl> &);

template <typename T>
std::shared_ptr<node_impl>
add(const std::shared_ptr<graph_impl> &Impl, T CGF,
add(const std::shared_ptr<graph_impl> &Impl,
std::shared_ptr<sycl::detail::kernel_impl> Kernel,
sycl::detail::NDRDescT NDRDesc,
sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName,
const std::vector<sycl::detail::AccessorImplPtr> &AccStorage,
const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep = {});

std::shared_ptr<node_impl>
add(const std::shared_ptr<graph_impl> &Impl,
std::function<void(handler &)> CGF,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep = {});

Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/handler_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ class handler_impl {
: MSubmissionPrimaryQueue(std::move(SubmissionPrimaryQueue)),
MSubmissionSecondaryQueue(std::move(SubmissionSecondaryQueue)){};

handler_impl() = default;

void setStateExplicitKernelBundle() {
if (MSubmissionState == HandlerSubmissionState::SPEC_CONST_SET_STATE)
throw sycl::exception(
Expand Down
Loading