Skip to content

Commit 9befab5

Browse files
authored
bench : multi-thread memcpy (#1534)
1 parent 9ac88f2 commit 9befab5

File tree

1 file changed

+92
-4
lines changed

1 file changed

+92
-4
lines changed

whisper.cpp

Lines changed: 92 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6064,6 +6064,43 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
60646064
// 1GB array
60656065
const size_t size = arr*1e6;
60666066

6067+
double sum = 0.0;
6068+
6069+
// heat-up
6070+
{
6071+
char * src = (char *) malloc(size);
6072+
char * dst = (char *) malloc(size);
6073+
6074+
for (size_t i = 0; i < size; i++) src[i] = i;
6075+
6076+
memcpy(dst, src, size); // heat-up
6077+
6078+
double tsum = 0.0;
6079+
6080+
for (size_t i = 0; i < n; i++) {
6081+
const int64_t t0 = ggml_time_us();
6082+
6083+
memcpy(dst, src, size);
6084+
6085+
const int64_t t1 = ggml_time_us();
6086+
6087+
tsum += (t1 - t0)*1e-6;
6088+
6089+
src[rand() % size] = rand() % 256;
6090+
}
6091+
6092+
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (heat-up)\n", (double) (n*size)/(tsum*1e9));
6093+
s += strbuf;
6094+
6095+
// needed to prevent the compiler from optimizing the memcpy away
6096+
{
6097+
for (size_t i = 0; i < size; i++) sum += dst[i];
6098+
}
6099+
6100+
free(src);
6101+
free(dst);
6102+
}
6103+
60676104
// single-thread
60686105
{
60696106
char * src = (char *) malloc(size);
@@ -6074,7 +6111,6 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
60746111
memcpy(dst, src, size); // heat-up
60756112

60766113
double tsum = 0.0;
6077-
double sum = 0.0;
60786114

60796115
for (size_t i = 0; i < n; i++) {
60806116
const int64_t t0 = ggml_time_us();
@@ -6088,21 +6124,73 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
60886124
src[rand() % size] = rand() % 256;
60896125
}
60906126

6091-
snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1e9));
6127+
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s ( 1 thread)\n", (double) (n*size)/(tsum*1e9));
60926128
s += strbuf;
60936129

60946130
// needed to prevent the compiler from optimizing the memcpy away
60956131
{
60966132
for (size_t i = 0; i < size; i++) sum += dst[i];
6133+
}
6134+
6135+
free(src);
6136+
free(dst);
6137+
}
6138+
6139+
// multi-thread
6140+
6141+
for (uint32_t n_threads = 1; n_threads <= std::thread::hardware_concurrency(); n_threads++) {
6142+
char * src = (char *) malloc(size);
6143+
char * dst = (char *) malloc(size);
6144+
6145+
for (size_t i = 0; i < size; i++) src[i] = i;
6146+
6147+
memcpy(dst, src, size); // heat-up
6148+
6149+
double tsum = 0.0;
6150+
6151+
auto helper = [&](int th) {
6152+
const int64_t i0 = (th + 0)*size/n_threads;
6153+
const int64_t i1 = (th + 1)*size/n_threads;
6154+
6155+
for (size_t i = 0; i < n; i++) {
6156+
memcpy(dst + i0, src + i0, i1 - i0);
60976157

6098-
snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
6099-
s += strbuf;
6158+
src[i0 + rand() % (i1 - i0)] = rand() % 256;
6159+
};
6160+
};
6161+
6162+
const int64_t t0 = ggml_time_us();
6163+
6164+
std::vector<std::thread> threads(n_threads - 1);
6165+
for (uint32_t th = 0; th < n_threads - 1; ++th) {
6166+
threads[th] = std::thread(helper, th);
6167+
}
6168+
6169+
helper(n_threads - 1);
6170+
6171+
for (uint32_t th = 0; th < n_threads - 1; ++th) {
6172+
threads[th].join();
6173+
}
6174+
6175+
const int64_t t1 = ggml_time_us();
6176+
6177+
tsum += (t1 - t0)*1e-6;
6178+
6179+
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), n_threads);
6180+
s += strbuf;
6181+
6182+
// needed to prevent the compiler from optimizing the memcpy away
6183+
{
6184+
for (size_t i = 0; i < size; i++) sum += dst[i];
61006185
}
61016186

61026187
free(src);
61036188
free(dst);
61046189
}
61056190

6191+
snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
6192+
s += strbuf;
6193+
61066194
return s.c_str();
61076195
}
61086196

0 commit comments

Comments
 (0)