Merged
2 changes: 1 addition & 1 deletion sycl/doc/EnvironmentVariables.md
@@ -28,7 +28,7 @@ subject to change. Do not rely on these variables in production code.
| SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
| SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availability and workload demand. |
| SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. |
| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |

`(*) Note: Any means this environment variable is effective when set to any non-null value.`
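
As an illustration of the interpretation described above, here is a minimal sketch of how the plugin could turn this variable into a batch-size setting, mirroring the parsing shown in the pi_level_zero.cpp change below. The `getBatchSizeSetting` name is illustrative only, and treating negative or non-numeric values as "use the default" is an assumption, not taken from the plugin.

```cpp
// Minimal sketch of mapping SYCL_PI_LEVEL_ZERO_BATCH_SIZE to a batch-size
// setting. getBatchSizeSetting() is an illustrative name; the handling of
// negative or malformed values is an assumption.
#include <cstdint>
#include <cstdlib>

static uint32_t getBatchSizeSetting() {
  uint32_t BatchSizeVal = 0; // 0 selects dynamic batch-size adjustment
  if (const char *BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE")) {
    int Parsed = std::atoi(BatchSizeStr);
    if (Parsed > 0)
      BatchSizeVal = static_cast<uint32_t>(Parsed); // fixed batch size
  }
  return BatchSizeVal;
}
```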

58 changes: 53 additions & 5 deletions sycl/plugins/level_zero/pi_level_zero.cpp
@@ -440,10 +440,8 @@ _pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
}

static const pi_uint32 ZeCommandListBatchSize = [] {
// Default value of 4. This has been seen as a good tradeoff between
// lower overhead of number of enqueue and fence calls, and getting
// commands seen as soon possible (i.e. lazy vs eager submission).
pi_uint32 BatchSizeVal = 4;
// Default value of 0. This specifies to use dynamic batch size adjustment.
pi_uint32 BatchSizeVal = 0;
const auto BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
if (BatchSizeStr) {
pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
@@ -550,6 +548,49 @@ pi_result _pi_device::getAvailableCommandList(
return pi_result;
}

void _pi_queue::adjustBatchSizeForFullBatch() {
// QueueBatchSize of 0 means never allow batching.
Review thread on this line:

Contributor: NIT: I think it can't happen in today's code that QueueBatchSize is equal to 0.

Contributor (Author): That is incorrect. In the queue itself, QueueBatchSize of 0 means no batching. I need to allow QueueBatchSize to be 1 when dynamic batching is in use, because dynamic batching can adjust QueueBatchSize down to as low as 1 if we are still doing too many partial batches. So, in the queue itself, QueueBatchSize == 0 means no batching, QueueBatchSize > 0 is just the current batching size, and the queue bool UseDynamicBatching controls whether dynamic batch adjustment ever changes QueueBatchSize up or down.

Contributor: > So, in the queue itself, QueueBatchSize == 0 means no batching

I realized this, yeah. But the code (in the _pi_queue constructor) seems to never set it to 0.

Contributor (Author): On looking at the code in pi_level_zero.hpp, I see that you are correct. Batching turned off is now really represented as QueueBatchSize == 1 and UseDynamicBatching == false; put another way, batching is turned off by using a fixed batch size of 1. Thank you for pointing that out.

if (QueueBatchSize == 0 || !UseDynamicBatching)
return;

NumTimesClosedFull += 1;

// If the number of times the list has been closed early is low, and
// the number of times it has been closed full is high, then raise
// the batching size slowly. Don't raise it if it is already pretty
// high.
if (NumTimesClosedEarly <= 2 && NumTimesClosedFull > 10) {
if (QueueBatchSize < 16) {
QueueBatchSize = QueueBatchSize + 1;
zePrint("Raising QueueBatchSize to %d\n", QueueBatchSize);
}
NumTimesClosedEarly = 0;
NumTimesClosedFull = 0;
}
}

void _pi_queue::adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize) {
// QueueBatchSize of 0 means never allow batching.
if (QueueBatchSize == 0 || !UseDynamicBatching)
return;

NumTimesClosedEarly += 1;

// If we are closing early more than about 3x the number of times
// it is closing full, lower the batch size to the value of the
// current open command list. This is trying to quickly get to a
// batch size that will be able to be closed full at least once
// in a while.
if (NumTimesClosedEarly > (NumTimesClosedFull + 1) * 3) {
QueueBatchSize = PartialBatchSize - 1;
if (QueueBatchSize < 1)
QueueBatchSize = 1;
zePrint("Lowering QueueBatchSize to %d\n", QueueBatchSize);
NumTimesClosedEarly = 0;
NumTimesClosedFull = 0;
}
}
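
For illustration only (not part of this PR), the following self-contained sketch reproduces the raise/lower heuristic implemented by `adjustBatchSizeForFullBatch` and `adjustBatchSizeForPartialBatch` above, so the evolution of QueueBatchSize can be traced in isolation. The `BatchTracker` type and the `main` driver are hypothetical; the thresholds are copied from the code shown above.

```cpp
// Self-contained sketch of the dynamic batch-size heuristic above.
// BatchTracker and main() are illustrative only; they are not part of the
// plugin. Thresholds (2, 10, 16, 3x) are taken from the code shown above.
#include <cstdint>
#include <cstdio>

struct BatchTracker {
  uint32_t QueueBatchSize = 2; // DynamicBatchStartSize in the plugin
  uint32_t NumTimesClosedEarly = 0;
  uint32_t NumTimesClosedFull = 0;

  // Mirrors adjustBatchSizeForFullBatch(): raise slowly while batches
  // routinely fill up and early closes are rare.
  void closedFull() {
    NumTimesClosedFull += 1;
    if (NumTimesClosedEarly <= 2 && NumTimesClosedFull > 10) {
      if (QueueBatchSize < 16)
        QueueBatchSize += 1;
      NumTimesClosedEarly = NumTimesClosedFull = 0;
    }
  }

  // Mirrors adjustBatchSizeForPartialBatch(): drop quickly toward the size
  // of the partial batch once early closes dominate.
  void closedPartial(uint32_t PartialBatchSize) {
    NumTimesClosedEarly += 1;
    if (NumTimesClosedEarly > (NumTimesClosedFull + 1) * 3) {
      QueueBatchSize = PartialBatchSize > 1 ? PartialBatchSize - 1 : 1;
      NumTimesClosedEarly = NumTimesClosedFull = 0;
    }
  }
};

int main() {
  BatchTracker T;
  for (int i = 0; i < 24; ++i) // many full closes push the size up: 2 -> 3 -> 4
    T.closedFull();
  std::printf("after full closes: %u\n", T.QueueBatchSize);    // prints 4
  for (int i = 0; i < 10; ++i) // repeated small partial closes pull it down
    T.closedPartial(3);
  std::printf("after partial closes: %u\n", T.QueueBatchSize); // prints 2
  return 0;
}
```

Under these assumptions the size climbs slowly while batches consistently fill, and drops quickly once partial closes outnumber full closes by roughly 3x.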

pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
ze_fence_handle_t ZeFence,
bool IsBlocking,
@@ -572,6 +613,8 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
return PI_SUCCESS;
}

adjustBatchSizeForFullBatch();

this->ZeOpenCommandList = nullptr;
this->ZeOpenCommandListFence = nullptr;
this->ZeOpenCommandListSize = 0;
@@ -592,7 +635,7 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
}

bool _pi_queue::isBatchingAllowed() {
return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0));
return (this->QueueBatchSize > 0 && ((ZeSerialize & ZeSerializeBlock) == 0));
}

pi_result _pi_queue::executeOpenCommandList() {
@@ -602,6 +645,8 @@ pi_result _pi_queue::executeOpenCommandList() {
if (OpenList) {
auto OpenListFence = this->ZeOpenCommandListFence;

adjustBatchSizeForPartialBatch(this->ZeOpenCommandListSize);

this->ZeOpenCommandList = nullptr;
this->ZeOpenCommandListFence = nullptr;
this->ZeOpenCommandListSize = 0;
@@ -1860,6 +1905,9 @@ pi_result piQueueRelease(pi_queue Queue) {
Queue->ZeCommandListFenceMap.clear();
ZE_CALL(zeCommandQueueDestroy(Queue->ZeCommandQueue));
Queue->ZeCommandQueue = nullptr;

zePrint("piQueueRelease NumTimesClosedFull %d, NumTimesClosedEarly %d\n",
Queue->NumTimesClosedFull, Queue->NumTimesClosedEarly);
}
return PI_SUCCESS;
}
29 changes: 27 additions & 2 deletions sycl/plugins/level_zero/pi_level_zero.hpp
@@ -271,11 +271,15 @@ struct _pi_context : _pi_object {
std::mutex NumEventsLiveInEventPoolMutex;
};

// If doing dynamic batching, start batch size at 2.
const pi_uint32 DynamicBatchStartSize = 2;

struct _pi_queue : _pi_object {
_pi_queue(ze_command_queue_handle_t Queue, pi_context Context,
pi_device Device, pi_uint32 QueueBatchSize)
pi_device Device, pi_uint32 BatchSize)
: ZeCommandQueue{Queue}, Context{Context}, Device{Device},
QueueBatchSize{QueueBatchSize} {}
QueueBatchSize{BatchSize > 0 ? BatchSize : DynamicBatchStartSize},
UseDynamicBatching{BatchSize == 0} {}

// Level Zero command queue handle.
ze_command_queue_handle_t ZeCommandQueue;
@@ -310,6 +314,18 @@ struct _pi_queue : _pi_object {
// is thread safe because of the locking of the queue that occurs.
pi_uint32 QueueBatchSize = {0};

// specifies whether this queue will be using dynamic batch size adjustment
// or not. This is set only at queue creation time, and is therefore
// const for the life of the queue.
const bool UseDynamicBatching;

// These two members are used to keep track of how often the
// batching closes and executes a command list before reaching the
// QueueBatchSize limit, versus how often we reach the limit.
// This info might be used to vary the QueueBatchSize value.
pi_uint32 NumTimesClosedEarly = {0};
pi_uint32 NumTimesClosedFull = {0};

// Map of all Command lists created with their associated Fence used for
// tracking when the command list is available for use again.
std::map<ze_command_list_handle_t, ze_fence_handle_t> ZeCommandListFenceMap;
@@ -318,6 +334,15 @@ struct _pi_queue : _pi_object {
// be batched together.
bool isBatchingAllowed();

// adjust the queue's batch size, knowing that the current command list
// is being closed with a full batch.
void adjustBatchSizeForFullBatch();

// adjust the queue's batch size, knowing that the current command list
// is being closed with only a partial batch of commands. How many commands
// are in this partial closure is passed as the parameter.
void adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize);

// Resets the Command List and Associated fence in the ZeCommandListFenceMap.
// If the reset command list should be made available, then MakeAvailable
// needs to be set to true. The caller must verify that this command list and
3 changes: 0 additions & 3 deletions sycl/test/plugins/level_zero_batch_test.cpp
@@ -2,9 +2,6 @@

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out

// Default batching should be 4
// RUN: env SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB4 %s

// Set batching to 4 explicitly
// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB4 %s

219 changes: 219 additions & 0 deletions sycl/test/plugins/level_zero_dynamic_batch_test.cpp
@@ -0,0 +1,219 @@
// REQUIRES: gpu, level_zero

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out

// Check that dynamic batching raises and then lowers the batch size
// RUN: env SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKDYN %s

// level_zero_dynamic_batch_test.cpp
//
// This tests the Level Zero plugin's dynamic kernel batch size adjustment
// code.
// It starts out by enqueueing 40 kernels before it does a wait, and it does
// this 5 times. That should cause the dynamic batch size adjustment to
// raise the batch size several times.
//
// Then the test starts enqueueing only 4 kernels before doing a wait, and
// it does that 5 times as well. That should cause the batch size to
// be lowered once, to a value below 4.
//
// CKDYN: Raising QueueBatchSize to 3
// CKDYN: Raising QueueBatchSize to 4
// CKDYN-NOT: Raising QueueBatchSize
// CKALL: Test Pass
// CKALL: Test Pass
// CKALL: Test Pass
// CKALL: Test Pass
// CKALL: Test Pass
// CKALL: Test Pass
// CKALL: Test Pass
// CKDYN: Lowering QueueBatchSize to 3
// CKDYN-NOT: Lowering QueueBatchSize
// CKALL: Test Pass
// CKALL: Test Pass
// CKALL: Test Pass
// CKALL: Test Pass

#include "CL/sycl.hpp"
#include <chrono>
#include <cmath>
#include <iostream>

namespace sycl = cl::sycl;

void validate(uint32_t *result, uint32_t *expect, size_t n) {
  int error = 0;
  for (size_t i = 0; i < n; i++) {
    if (result[i] != expect[i]) {
      error++;
      if (error < 10) {
        printf("Error: %u, expect: %u\n", result[i], expect[i]);
      }
    }
  }
  error > 0 ? printf("Error: %d\n", error) : printf("Test Pass\n");
}

int main(int argc, char *argv[]) {
size_t M = 65536;
size_t N = 512 / 4;
size_t AL = M * N * sizeof(uint32_t);

sycl::queue q(sycl::default_selector{});
auto ctx = q.get_context();
auto dev = q.get_device();

uint32_t *Y1 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
uint32_t *Z1 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
uint32_t *Z2 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
uint32_t *Z3 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
uint32_t *Z4 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
uint32_t *Z5 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
uint32_t *Z6 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
uint32_t *Z7 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
uint32_t *Z8 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));

for (size_t i = 0; i < M * N; i++) {
Y1[i] = i % 255;
}

memset(Z1, '\0', AL);
memset(Z2, '\0', AL);
memset(Z3, '\0', AL);
memset(Z4, '\0', AL);
memset(Z5, '\0', AL);
memset(Z6, '\0', AL);
memset(Z7, '\0', AL);
memset(Z8, '\0', AL);

for (size_t i = 0; i < 5; i++) {
for (size_t j = 0; j < 5; j++) {
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy1>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z1[m * N + n] = Y1[m * N + n];
});
});
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy2>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z2[m * N + n] = Y1[m * N + n];
});
});
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy3>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z3[m * N + n] = Y1[m * N + n];
});
});
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy4>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z4[m * N + n] = Y1[m * N + n];
});
});
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy5>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z5[m * N + n] = Y1[m * N + n];
});
});
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy6>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z6[m * N + n] = Y1[m * N + n];
});
});
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy7>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z7[m * N + n] = Y1[m * N + n];
});
});
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy8>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z8[m * N + n] = Y1[m * N + n];
});
});
}
q.wait();
}

validate(Y1, Z1, M * N);
validate(Y1, Z2, M * N);
validate(Y1, Z3, M * N);
validate(Y1, Z4, M * N);
validate(Y1, Z5, M * N);
validate(Y1, Z6, M * N);
validate(Y1, Z7, M * N);
validate(Y1, Z8, M * N);

for (size_t i = 0; i < 5; i++) {
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy9>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z1[m * N + n] = Y1[m * N + n];
});
});
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy10>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z2[m * N + n] = Y1[m * N + n];
});
});
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy11>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z3[m * N + n] = Y1[m * N + n];
});
});
q.submit([&](sycl::handler &h) {
h.parallel_for<class u32_copy12>(sycl::range<2>{M, N},
[=](sycl::id<2> it) {
const int m = it[0];
const int n = it[1];
Z4[m * N + n] = Y1[m * N + n];
});
});
q.wait();
}
validate(Y1, Z1, M * N);
validate(Y1, Z2, M * N);
validate(Y1, Z3, M * N);
validate(Y1, Z4, M * N);

sycl::free(Y1, ctx);
sycl::free(Z1, ctx);
sycl::free(Z2, ctx);
sycl::free(Z3, ctx);
sycl::free(Z4, ctx);
sycl::free(Z5, ctx);
sycl::free(Z6, ctx);
sycl::free(Z7, ctx);
sycl::free(Z8, ctx);

return 0;
}