Skip to content

Commit

Permalink
[SYCL] Fix reductions not working inside graph
Browse files Browse the repository at this point in the history
* Graph submission now properly creates a host visible event on the command list allowing auxilliary resources to be cleaned up

* executeCommandList slightly modified to block execution only for command lists not allowed to be batched.
  • Loading branch information
Bensuo authored Dec 13, 2022
1 parent 3073cfc commit c99bdca
Showing 1 changed file with 8 additions and 24 deletions.
32 changes: 8 additions & 24 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1583,9 +1583,11 @@ void _pi_queue::CaptureIndirectAccesses() {
pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
bool IsBlocking,
bool OKToBatchCommand) {
// When executing a Graph, defer execution
if( this->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) return PI_SUCCESS;

// When executing a Graph, defer execution if this is a command
// which could be batched (i.e. likely a kernel submission)
if (this->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION && OKToBatchCommand)
return PI_SUCCESS;

bool UseCopyEngine = CommandList->second.isCopy(this);

// If the current LastCommandEvent is the nullptr, then it means
Expand Down Expand Up @@ -3827,31 +3829,13 @@ pi_result piQueueFinish(pi_queue Queue) {
// so this can be left as a no-op.
pi_result piQueueFlush(pi_queue Queue) {
if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) {

pi_command_list_ptr_t CommandList{};
// TODO:
CommandList = Queue->LazyCommandListMap.begin();

auto &ZeCommandQueue = CommandList->second.ZeQueue;
// Scope of the lock must be till the end of the function, otherwise new mem
// allocs can be created between the moment when we made a snapshot and the
// moment when command list is closed and executed. But mutex is locked only
// if indirect access tracking enabled, because std::defer_lock is used.
// unique_lock destructor at the end of the function will unlock the mutex
// if it was locked (which happens only if IndirectAccessTrackingEnabled is
// true).
std::unique_lock<pi_shared_mutex> ContextsLock(
Queue->Device->Platform->ContextsMutex, std::defer_lock);

// Close the command list and have it ready for dispatch.
ZE_CALL(zeCommandListClose, (CommandList->first));

// Offload command list to the GPU for asynchronous execution
auto ZeCommandList = CommandList->first;
auto ZeResult = ZE_CALL_NOCHECK(
zeCommandQueueExecuteCommandLists,
(ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence));

Queue->executeCommandList(CommandList, false, false);
}
(void)Queue;
return PI_SUCCESS;
}

Expand Down

0 comments on commit c99bdca

Please sign in to comment.