Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,57 +13,37 @@ KERNEL(calc_mean_sqr_mean_per_feature)(
__global ACCUMULATOR_TYPE* internal_mean,
__global ACCUMULATOR_TYPE* internal_variance
) {
const uint data_set_idx = get_global_id(1); // batch * feature split
const uint in_data_set_idx = get_global_id(0);
const uint workers_per_dataset = LWS0 / FSV; // 16 datasets are handled by one local workgroup
const uint data_set_size = INPUT0_SIZE_X * INPUT0_SIZE_Y;
const uint items_num = data_set_size / workers_per_dataset;
const uint leftovers = data_set_size - (items_num * workers_per_dataset);
const uint b = get_global_id(2) / INPUT0_FEATURE_NUM;
const uint f = get_global_id(2) % INPUT0_FEATURE_NUM;
const uint y = get_global_id(1);
const uint x = get_global_id(0);
const uint divisor_x = INPUT0_SIZE_X / get_local_size(0);
const uint divisor_y = INPUT0_SIZE_Y / get_local_size(1);

const uint INPUT0_ALIGNED_FEATURE_NUM = ALIGN(INPUT0_FEATURE_NUM, FSV);
const uint b = (data_set_idx * FSV) / INPUT0_ALIGNED_FEATURE_NUM;
const uint f_base = (data_set_idx * FSV) % INPUT0_ALIGNED_FEATURE_NUM;
const uint data_set_offset = INPUT0_GET_INDEX(b, f_base, 0, 0);
const uint my_data_offset = data_set_offset + in_data_set_idx;

__local ACCUMULATOR_TYPE sum_per_feature[SLM_SIZE];
__local ACCUMULATOR_TYPE sqr_sum_per_feature[SLM_SIZE];

ACCUMULATOR_TYPE sum = ACCUMULATOR_VAL_ZERO;
ACCUMULATOR_TYPE sqr_sum = ACCUMULATOR_VAL_ZERO;

for (uint i = 0; i < items_num; ++i) {
ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * workers_per_dataset * FSV]);
sum += data;
sqr_sum += data * data;
}

if (in_data_set_idx < leftovers) {
ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset + items_num * workers_per_dataset * FSV + in_data_set_idx]);
sum += data;
sqr_sum += data * data;
}

sum_per_feature[in_data_set_idx] = sum;
sqr_sum_per_feature[in_data_set_idx] = sqr_sum;
const uint num_local_workers = LWS0;
const uint worker_block_idx = in_data_set_idx / FSV;
uint reduce_add_level = 1;
while ((SLM_SIZE / FSV) > reduce_add_level) {
barrier(CLK_LOCAL_MEM_FENCE);
if (worker_block_idx % (reduce_add_level * 2) == 0 && (in_data_set_idx + FSV * reduce_add_level) < num_local_workers) {
sum_per_feature[in_data_set_idx] += sum_per_feature[in_data_set_idx + FSV * reduce_add_level];
sqr_sum_per_feature[in_data_set_idx] += sqr_sum_per_feature[in_data_set_idx + FSV * reduce_add_level];
ACCUMULATOR_TYPE local_sum = ACCUMULATOR_VAL_ZERO;
ACCUMULATOR_TYPE local_sqr_sum = ACCUMULATOR_VAL_ZERO;
ACCUMULATOR_TYPE wi_sum = ACCUMULATOR_VAL_ZERO;
ACCUMULATOR_TYPE wi_sqr_sum = ACCUMULATOR_VAL_ZERO;
unroll_for (uint i = 0; i < divisor_y; ++i) {
unroll_for (uint j = 0; j < divisor_x; ++j) {
const uint data_offset = INPUT0_GET_INDEX(b, f, y + (get_local_size(1) * i), x + (get_local_size(0) * j));
ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[data_offset]);
wi_sum += data;
wi_sqr_sum += data * data;
}
reduce_add_level *= 2;
}

if (worker_block_idx == 0 && (f_base + in_data_set_idx) < INPUT0_FEATURE_NUM) {
ACCUMULATOR_TYPE mean = sum_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size);
ACCUMULATOR_TYPE variance = sqr_sum_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size);
uint bf = b * INPUT0_FEATURE_NUM + f_base + in_data_set_idx;
internal_mean[bf] = mean;
internal_variance[bf] = variance;
local_sum += work_group_reduce_add(wi_sum);
local_sqr_sum += work_group_reduce_add(wi_sqr_sum);

uint bf = b * INPUT0_FEATURE_NUM + f;
if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0) {
uint group_size = get_num_groups(0) * get_num_groups(1) * get_num_groups(2);
uint group_wi_size = INPUT0_SIZE_X * INPUT0_SIZE_Y;
float mean = local_sum / TO_ACCUMULATOR_TYPE(group_wi_size);
float variance = local_sqr_sum / TO_ACCUMULATOR_TYPE(group_wi_size);
internal_mean[b * INPUT0_FEATURE_NUM + f] = mean;
internal_variance[b * INPUT0_FEATURE_NUM + f] = variance;
}
}
#elif GROUP_NORM_KERNEL_GROUP_MEAN_VARIANCE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,44 @@ class GroupNormalizationGeneratorCalcSQRMean : public GroupNormalizationGenerato
return args;
}

// Returns all positive divisors of n in ascending order.
// Returns an empty vector for n == 0 (matches the previous trial-division behavior).
// Uses O(sqrt(n)) divisor-pair enumeration instead of scanning 1..n: every
// divisor i <= sqrt(n) pairs with the co-divisor n / i >= sqrt(n).
static std::vector<size_t> getDivisors(size_t n) {
    std::vector<size_t> small_divisors;   // divisors <= sqrt(n), found ascending
    std::vector<size_t> large_divisors;   // co-divisors > sqrt(n), found descending
    for (size_t i = 1; i * i <= n; ++i) {
        if (n % i == 0) {
            small_divisors.push_back(i);
            // Skip the duplicate when n is a perfect square (i == n / i).
            if (i != n / i) {
                large_divisors.push_back(n / i);
            }
        }
    }
    // Append the large co-divisors in reverse so the result stays ascending.
    small_divisors.insert(small_divisors.end(), large_divisors.rbegin(), large_divisors.rend());
    return small_divisors;
}

// Shrinks a 2D work-group shape (x, y) so its area fits within
// max_work_group_size, while keeping each dimension an exact divisor of the
// original (so the global size stays evenly divisible). Among all candidate
// shapes (x / dx, y / dy) that fit, the one with the largest area is chosen;
// on ties the earliest candidate in (dx, dy) scan order wins. The (1, 1)
// fallback always satisfies the limit.
static std::pair<size_t, size_t> adjustWorkGroupSize(size_t x, size_t y, size_t max_work_group_size) {
    // Already within the limit — keep the shape untouched.
    if (x * y <= max_work_group_size) {
        return {x, y};
    }

    size_t best_x = 1;
    size_t best_y = 1;
    size_t max_area = 0;

    // Enumerate divisor pairs inline rather than materializing divisor lists.
    for (size_t dx = 1; dx <= x; ++dx) {
        if (x % dx != 0) {
            continue;
        }
        const size_t cand_x = x / dx;
        for (size_t dy = 1; dy <= y; ++dy) {
            if (y % dy != 0) {
                continue;
            }
            const size_t cand_y = y / dy;
            const size_t area = cand_x * cand_y;
            if (area <= max_work_group_size && area > max_area) {
                best_x = cand_x;
                best_y = cand_y;
                max_area = area;
            }
        }
    }

    return {best_x, best_y};
}


[[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override {
return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {
assert(!params.is_dynamic());
Expand All @@ -114,25 +152,16 @@ class GroupNormalizationGeneratorCalcSQRMean : public GroupNormalizationGenerato
auto f = extract_channel(ChannelName::FEATURE, ol);
auto b = extract_channel(ChannelName::BATCH, ol);

wgs.global[0] = x * y;
wgs.global[1] = ceil_div(f, fsv) * b;
wgs.global[2] = 1;

wgs.local[0] = x * y;
wgs.local[1] = 1;
wgs.local[2] = 1;
size_t max_wgs = params.get_device_info().max_work_group_size;
auto [wgs0, wgs1] = adjustWorkGroupSize(x, y, max_wgs);

auto max_wgs = params.get_device_info().max_work_group_size;
wgs.global[0] = wgs0;
wgs.global[1] = wgs1;
wgs.global[2] = b * f;

size_t divisor = 2;
while (wgs.local[0] > (max_wgs / fsv)) {
if (wgs.global[0] % divisor == 0) {
wgs.local[0] = wgs.global[0] / divisor;
}
divisor += 1;
}
wgs.local[0] *= fsv;
wgs.global[0] = wgs.local[0];
wgs.local[0] = wgs0;
wgs.local[1] = wgs1;
wgs.local[2] = 1;
}};
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,6 @@ struct GroupNormalizationFsv16Opt : public GroupNormalizationBase {
}

if (in0_layout.is_static() && out_layout.is_static()) {
// no support for spatial paddings in static case
if (in0_layout.data_padding._lower_size[3] > 0 || in0_layout.data_padding._lower_size[2] > 0 || in0_layout.data_padding._upper_size[3] > 0 ||
in0_layout.data_padding._upper_size[2] > 0) {
return false;
}

if (!fused_ops_are_one_of<eltwise, activation, reorder>(node.get_fused_primitives())) {
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ typedef std::tuple<
std::size_t, // Number of groups
double, // Epsilon
format, // First input layout
padding, // First input padding
format, // Output layout
padding // Output padding
>
Expand All @@ -38,10 +39,11 @@ class GroupNormalizationGPUTest : public ::testing::TestWithParam<GroupNormaliza

void SetUp() override {
const auto& params = GetParam();
const auto& [input_shape, _num_groups_, _epsilon_, _in_format_, _out_format_, _output_pad_] = params;
const auto& [input_shape, _num_groups_, _epsilon_, _in_format_, _in_pad_, _out_format_, _output_pad_] = params;
num_groups_ = _num_groups_;
epsilon_ = _epsilon_;
in_format_ = _in_format_;
in_pad_ = _in_pad_;
out_format_ = _out_format_;
output_pad_ = _output_pad_;
std::copy(std::begin(input_shape), std::end(input_shape), std::back_inserter(data_shape_));
Expand Down Expand Up @@ -71,7 +73,8 @@ class GroupNormalizationGPUTest : public ::testing::TestWithParam<GroupNormaliza
}
tp.add(input_layout{scale_primitive_, scale_bias_layout_});
tp.add(input_layout{bias_primitive_, scale_bias_layout_});
tp.add(reorder{reordered_data_primitive, data_primitive_, in_format_, data_types::f32});
// tp.add(reorder{reordered_data_primitive, data_primitive_, in_format_, data_types::f32});
tp.add(reorder{reordered_data_primitive, data_primitive_, layout{input_shape, data_types::f32, in_format_, in_pad_}});

auto g = group_normalization{
"group_normalization_output",
Expand Down Expand Up @@ -131,6 +134,7 @@ class GroupNormalizationGPUTest : public ::testing::TestWithParam<GroupNormaliza
std::size_t num_groups_{};
double epsilon_{};
format in_format_{format::any};
padding in_pad_{padding()};
format out_format_{format::any};
padding output_pad_{padding()};
network::ptr network_{};
Expand Down Expand Up @@ -175,6 +179,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::ValuesIn(std::vector<size_t>{1, 4}),
::testing::Values(0.0025),
::testing::ValuesIn(f_planar_4d_formats),
::testing::ValuesIn({padding()}),
::testing::ValuesIn(f_4d_formats),
::testing::ValuesIn({padding(), padding({0, 0, 1, 1})})));

Expand All @@ -186,6 +191,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::ValuesIn(std::vector<size_t>{1, 2, 4}),
::testing::Values(0.0025),
::testing::ValuesIn(f_blocked_4d_formats),
::testing::ValuesIn({padding(), padding({0, 0, 1, 1})}),
::testing::ValuesIn(f_4d_formats),
::testing::ValuesIn({padding(), padding({0, 16, 0, 0})})));

Expand All @@ -197,6 +203,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::ValuesIn(std::vector<size_t>{1, 4}),
::testing::Values(0.0025),
::testing::ValuesIn(f_planar_5d_formats),
::testing::ValuesIn({padding()}),
::testing::ValuesIn(f_planar_5d_formats),
::testing::ValuesIn({padding(), padding({0, 0, 1, 1})})));

Expand Down
Loading