Skip to content

Commit

Permalink
[SYCL] Remove CGF reuse in graph nodes
Browse files Browse the repository at this point in the history
- Note reductions are broken by this commit due to missing accessor support

## Removes use of the CGF when submitting graph nodes.
- Handler info is extracted and copied into nodes
- Adding nodes in record and replay moved to finalize.
- Workarounds for reduction wg sizes added.
- Introduce `graph-record-temp-scope.cpp` test case which fails before this commit and passes afterwards.

## [SYCL] Stop USM arguments creating edges in the explicit API 

Instead of USM arguments, it is buffer accessors that should be used for
edge detection. Fixes `graph-explicit-node-ordering.cpp` test ordering which is currently
creating incorrect extra edges

Also added `graph-explicit-dotp-buffer.cpp` test for explicit API with accessor edges, we can use to see if this
logic works once accessors are better supported.

## [SYCL] New sycl::handler constructor for a graph

This change adds a new handler constructor which takes
a graph, rather than creating a default temporary queue object
to pass to the existing constructor.

Co-authored-by: Ewan Crawford <[email protected]>
  • Loading branch information
Bensuo and EwanC authored Mar 23, 2023
1 parent b7f17c8 commit 1efde99
Show file tree
Hide file tree
Showing 12 changed files with 402 additions and 91 deletions.
16 changes: 16 additions & 0 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
class handler;
template <typename T, int Dimensions, typename AllocatorT, typename Enable>
class buffer;

namespace ext::oneapi::experimental::detail {
class graph_impl;
}
namespace detail {

class handler_impl;
Expand Down Expand Up @@ -372,6 +376,14 @@ class __SYCL_EXPORT handler {
std::shared_ptr<detail::queue_impl> PrimaryQueue,
std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost);

/// Constructs SYCL handler from Graph.
///
/// The hander will add the command-group as a node to the graph rather than
/// enqueueing it straight away.
///
/// \param Graph is a SYCL command_graph
handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);

/// Stores copy of Arg passed to the MArgsStorage.
template <typename T, typename F = typename detail::remove_const_t<
typename detail::remove_reference_t<T>>>
Expand Down Expand Up @@ -2528,6 +2540,8 @@ class __SYCL_EXPORT handler {
private:
std::shared_ptr<detail::handler_impl> MImpl;
std::shared_ptr<detail::queue_impl> MQueue;
std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph;

/// The storage for the arguments passed.
/// We need to store a copy of values that are passed explicitly through
/// set_arg, require and so on, because we need them to be alive after
Expand Down Expand Up @@ -2611,6 +2625,8 @@ class __SYCL_EXPORT handler {
friend class ::MockHandler;
friend class detail::queue_impl;

friend class ext::oneapi::experimental::detail::graph_impl;

bool DisableRangeRounding();

bool RangeRoundingTrace();
Expand Down
117 changes: 81 additions & 36 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <detail/graph_impl.hpp>
#include <detail/queue_impl.hpp>
#include <detail/scheduler/commands.hpp>
#include <sycl/queue.hpp>

namespace sycl {
Expand Down Expand Up @@ -61,76 +62,94 @@ void graph_impl::remove_root(const std::shared_ptr<node_impl> &Root) {
// Recursive check if a graph node or its successors contains a given kernel
// argument.
//
// @param[in] arg The kernel argument to check for.
// @param[in] currentNode The current graph node being checked.
// @param[in,out] deps The unique list of dependencies which have been
// @param[in] Arg The kernel argument to check for.
// @param[in] CurrentNode The current graph node being checked.
// @param[in,out] Deps The unique list of dependencies which have been
// identified for this arg.
// @param[in] dereferencePtr if true arg comes direct from the handler in which
// case it will need to be deferenced to check actual value.
//
// @returns True if a dependency was added in this node of any of its
// successors.
bool check_for_arg(const sycl::detail::ArgDesc &Arg,
const std::shared_ptr<node_impl> &CurrentNode,
std::set<std::shared_ptr<node_impl>> &Deps,
bool DereferencePtr = false) {
std::set<std::shared_ptr<node_impl>> &Deps) {
bool SuccessorAddedDep = false;
for (auto &Successor : CurrentNode->MSuccessors) {
SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps, DereferencePtr);
SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps);
}

if (Deps.find(CurrentNode) == Deps.end() &&
CurrentNode->has_arg(Arg, DereferencePtr) && !SuccessorAddedDep) {
if (Deps.find(CurrentNode) == Deps.end() && CurrentNode->has_arg(Arg) &&
!SuccessorAddedDep) {
Deps.insert(CurrentNode);
return true;
}
return SuccessorAddedDep;
}

template <typename T>
std::shared_ptr<node_impl>
graph_impl::add(const std::shared_ptr<graph_impl> &impl, T cgf,
const std::vector<sycl::detail::ArgDesc> &args,
const std::vector<std::shared_ptr<node_impl>> &dep) {
std::shared_ptr<node_impl> nodeImpl =
std::make_shared<node_impl>(impl, cgf, args);
graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
std::function<void(handler &)> CGF,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep) {
sycl::handler Handler{Impl};
CGF(Handler);

return this->add(Impl, Handler.MKernel, Handler.MNDRDesc,
Handler.MOSModuleHandle, Handler.MKernelName,
Handler.MAccStorage, Handler.MLocalAccStorage,
Handler.MRequirements, Handler.MArgs, {});
}

std::shared_ptr<node_impl> graph_impl::add(
const std::shared_ptr<graph_impl> &Impl,
std::shared_ptr<sycl::detail::kernel_impl> Kernel,
sycl::detail::NDRDescT NDRDesc, sycl::detail::OSModuleHandle OSModuleHandle,
std::string KernelName,
const std::vector<sycl::detail::AccessorImplPtr> &AccStorage,
const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep) {
const std::shared_ptr<node_impl> &NodeImpl = std::make_shared<node_impl>(
Impl, Kernel, NDRDesc, OSModuleHandle, KernelName, AccStorage,
LocalAccStorage, Requirements, Args);
// Copy deps so we can modify them
auto deps = dep;
auto Deps = Dep;
// A unique set of dependencies obtained by checking kernel arguments
std::set<std::shared_ptr<node_impl>> uniqueDeps;
for (auto &arg : args) {
if (arg.MType != sycl::detail::kernel_param_kind_t::kind_pointer) {
// for accessors
std::set<std::shared_ptr<node_impl>> UniqueDeps;
for (auto &Arg : Args) {
if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_accessor) {
continue;
}
// Look through the graph for nodes which share this argument
for (auto nodePtr : MRoots) {
check_for_arg(arg, nodePtr, uniqueDeps, true);
for (auto NodePtr : MRoots) {
check_for_arg(Arg, NodePtr, UniqueDeps);
}
}

// Add any deps determined from arguments into the dependency list
deps.insert(deps.end(), uniqueDeps.begin(), uniqueDeps.end());
if (!deps.empty()) {
for (auto n : deps) {
n->register_successor(nodeImpl); // register successor
this->remove_root(nodeImpl); // remove receiver from root node
// Add any deps determined from accessor arguments into the dependency list
Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end());
if (!Deps.empty()) {
for (auto N : Deps) {
N->register_successor(NodeImpl); // register successor
this->remove_root(NodeImpl); // remove receiver from root node
// list
}
} else {
this->add_root(nodeImpl);
this->add_root(NodeImpl);
}
return nodeImpl;
return NodeImpl;
}

bool graph_impl::clear_queues() {
bool anyQueuesCleared = false;
for (auto &q : MRecordingQueues) {
q->setCommandGraph(nullptr);
anyQueuesCleared = true;
bool AnyQueuesCleared = false;
for (auto &Queue : MRecordingQueues) {
Queue->setCommandGraph(nullptr);
AnyQueuesCleared = true;
}
MRecordingQueues.clear();

return anyQueuesCleared;
return AnyQueuesCleared;
}

void node_impl::exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
Expand All @@ -139,7 +158,33 @@ void node_impl::exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
for (auto Sender : MPredecessors)
Deps.push_back(Sender->get_event());

MEvent = Queue->submit(wrapper{MBody, Deps}, Queue _CODELOCFW(CodeLoc));
// Enqueue kernel here instead of submit

std::vector<pi_event> RawEvents;
pi_event *OutEvent = nullptr;
auto NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
NewEvent->setContextImpl(Queue->getContextImplPtr());
NewEvent->setStateIncomplete();
OutEvent = &NewEvent->getHandleRef();
pi_result Res =
Queue->getPlugin().call_nocheck<sycl::detail::PiApiKind::piEventCreate>(
sycl::detail::getSyclObjImpl(Queue->get_context())->getHandleRef(),
OutEvent);
if (Res != pi_result::PI_SUCCESS) {
throw sycl::exception(errc::event,
"Failed to create event for node submission");
}

pi_int32 Result = enqueueImpKernel(
Queue, MNDRDesc, MArgs, /* KernelBundleImpPtr */ nullptr, MKernel,
MKernelName, MOSModuleHandle, RawEvents, OutEvent, nullptr);
if (Result != pi_result::PI_SUCCESS) {
throw sycl::exception(errc::kernel, "Error enqueuing graph node kernel");
}
sycl::event QueueEvent =
sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
Queue->addEvent(QueueEvent);
MEvent = QueueEvent;
}
} // namespace detail

Expand Down
82 changes: 63 additions & 19 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@
#pragma once

#include <sycl/detail/cg_types.hpp>
#include <sycl/detail/os_util.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>
#include <sycl/handler.hpp>

#include <detail/kernel_impl.hpp>

#include <cstring>
#include <functional>
#include <list>
#include <set>
Expand Down Expand Up @@ -48,9 +52,24 @@ struct node_impl {
std::vector<std::shared_ptr<node_impl>> MSuccessors;
std::vector<std::shared_ptr<node_impl>> MPredecessors;

std::function<void(sycl::handler &)> MBody;

/// Kernel to be executed by this node
std::shared_ptr<sycl::detail::kernel_impl> MKernel;
/// Description of the kernel global and local sizes as well as offset
sycl::detail::NDRDescT MNDRDesc;
/// Module handle for the kernel to be executed.
sycl::detail::OSModuleHandle MOSModuleHandle =
sycl::detail::OSUtil::ExeModuleHandle;
/// Kernel name inside the module
std::string MKernelName;
std::vector<sycl::detail::AccessorImplPtr> MAccStorage;
std::vector<sycl::detail::LocalAccessorImplPtr> MLocalAccStorage;
std::vector<sycl::detail::AccessorImplHost *> MRequirements;

/// Store arg descriptors for the kernel arguments
std::vector<sycl::detail::ArgDesc> MArgs;
// We need to store local copies of the values pointed to by MArgs since they
// may go out of scope before execution.
std::vector<std::vector<std::byte>> MArgStorage;

void exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
_CODELOCPARAM(&CodeLoc));
Expand All @@ -66,17 +85,30 @@ struct node_impl {

sycl::event get_event(void) const { return MEvent; }

template <typename T>
node_impl(const std::shared_ptr<graph_impl> &Graph, T CGF,
const std::vector<sycl::detail::ArgDesc> &Args)
: MScheduled(false), MGraph(Graph), MBody(CGF), MArgs(Args) {
node_impl(
const std::shared_ptr<graph_impl> &Graph,
std::shared_ptr<sycl::detail::kernel_impl> Kernel,
sycl::detail::NDRDescT NDRDesc,
sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName,
const std::vector<sycl::detail::AccessorImplPtr> &AccStorage,
const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
const std::vector<sycl::detail::ArgDesc> &args)
: MScheduled(false), MGraph(Graph), MKernel(Kernel), MNDRDesc(NDRDesc),
MOSModuleHandle(OSModuleHandle), MKernelName(KernelName),
MAccStorage(AccStorage), MLocalAccStorage(LocalAccStorage),
MRequirements(Requirements), MArgs(args), MArgStorage() {

// Need to copy the arg values to node local storage so that they don't go
// out of scope before execution
for (size_t i = 0; i < MArgs.size(); i++) {
if (MArgs[i].MType == sycl::detail::kernel_param_kind_t::kind_pointer) {
// Make sure we are storing the actual USM pointer for comparison
// purposes, note we couldn't actually submit using these copies of the
// args if subsequent code expects a void**.
MArgs[i].MPtr = *(void **)(MArgs[i].MPtr);
}
auto &CurrentArg = MArgs[i];
MArgStorage.emplace_back(CurrentArg.MSize);
auto StoragePtr = MArgStorage.back().data();
if (CurrentArg.MPtr)
std::memcpy(StoragePtr, CurrentArg.MPtr, CurrentArg.MSize);
// Set the arg descriptor to point to the new storage
CurrentArg.MPtr = StoragePtr;
}
}

Expand All @@ -90,13 +122,14 @@ struct node_impl {
Schedule.push_front(std::shared_ptr<node_impl>(this));
}

bool has_arg(const sycl::detail::ArgDesc &Arg, bool DereferencePtr = false) {
bool has_arg(const sycl::detail::ArgDesc &Arg) {
for (auto &NodeArg : MArgs) {
if (Arg.MType == NodeArg.MType && Arg.MSize == NodeArg.MSize) {
// Args coming directly from the handler will need to be dereferenced
// since they are actually void**
void *IncomingPtr = DereferencePtr ? *(void **)Arg.MPtr : Arg.MPtr;
if (IncomingPtr == NodeArg.MPtr) {
// Args are actually void** so we need to dereference them to compare
// actual values
void *IncomingPtr = *static_cast<void **>(Arg.MPtr);
void *ArgPtr = *static_cast<void **>(NodeArg.MPtr);
if (IncomingPtr == ArgPtr) {
return true;
}
}
Expand All @@ -119,9 +152,20 @@ struct graph_impl {
void add_root(const std::shared_ptr<node_impl> &);
void remove_root(const std::shared_ptr<node_impl> &);

template <typename T>
std::shared_ptr<node_impl>
add(const std::shared_ptr<graph_impl> &Impl, T CGF,
add(const std::shared_ptr<graph_impl> &Impl,
std::shared_ptr<sycl::detail::kernel_impl> Kernel,
sycl::detail::NDRDescT NDRDesc,
sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName,
const std::vector<sycl::detail::AccessorImplPtr> &AccStorage,
const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep = {});

std::shared_ptr<node_impl>
add(const std::shared_ptr<graph_impl> &Impl,
std::function<void(handler &)> CGF,
const std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::shared_ptr<node_impl>> &Dep = {});

Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/handler_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ class handler_impl {
: MSubmissionPrimaryQueue(std::move(SubmissionPrimaryQueue)),
MSubmissionSecondaryQueue(std::move(SubmissionSecondaryQueue)){};

handler_impl() = default;

void setStateExplicitKernelBundle() {
if (MSubmissionState == HandlerSubmissionState::SPEC_CONST_SET_STATE)
throw sycl::exception(
Expand Down
Loading

0 comments on commit 1efde99

Please sign in to comment.