@@ -6064,6 +6064,43 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6064
6064
// 1GB array
6065
6065
const size_t size = arr*1e6 ;
6066
6066
6067
+ double sum = 0.0 ;
6068
+
6069
+ // heat-up
6070
+ {
6071
+ char * src = (char *) malloc (size);
6072
+ char * dst = (char *) malloc (size);
6073
+
6074
+ for (size_t i = 0 ; i < size; i++) src[i] = i;
6075
+
6076
+ memcpy (dst, src, size); // heat-up
6077
+
6078
+ double tsum = 0.0 ;
6079
+
6080
+ for (size_t i = 0 ; i < n; i++) {
6081
+ const int64_t t0 = ggml_time_us ();
6082
+
6083
+ memcpy (dst, src, size);
6084
+
6085
+ const int64_t t1 = ggml_time_us ();
6086
+
6087
+ tsum += (t1 - t0)*1e-6 ;
6088
+
6089
+ src[rand () % size] = rand () % 256 ;
6090
+ }
6091
+
6092
+ snprintf (strbuf, sizeof (strbuf), " memcpy: %7.2f GB/s (heat-up)\n " , (double ) (n*size)/(tsum*1e9 ));
6093
+ s += strbuf;
6094
+
6095
+ // needed to prevent the compiler from optimizing the memcpy away
6096
+ {
6097
+ for (size_t i = 0 ; i < size; i++) sum += dst[i];
6098
+ }
6099
+
6100
+ free (src);
6101
+ free (dst);
6102
+ }
6103
+
6067
6104
// single-thread
6068
6105
{
6069
6106
char * src = (char *) malloc (size);
@@ -6074,7 +6111,6 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6074
6111
memcpy (dst, src, size); // heat-up
6075
6112
6076
6113
double tsum = 0.0 ;
6077
- double sum = 0.0 ;
6078
6114
6079
6115
for (size_t i = 0 ; i < n; i++) {
6080
6116
const int64_t t0 = ggml_time_us ();
@@ -6088,21 +6124,73 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6088
6124
src[rand () % size] = rand () % 256 ;
6089
6125
}
6090
6126
6091
- snprintf (strbuf, sizeof (strbuf), " memcpy: %.2f GB/s (1 thread)\n " , (double ) (n*size)/(tsum*1e9 ));
6127
+ snprintf (strbuf, sizeof (strbuf), " memcpy: %7 .2f GB/s ( 1 thread)\n " , (double ) (n*size)/(tsum*1e9 ));
6092
6128
s += strbuf;
6093
6129
6094
6130
// needed to prevent the compiler from optimizing the memcpy away
6095
6131
{
6096
6132
for (size_t i = 0 ; i < size; i++) sum += dst[i];
6133
+ }
6134
+
6135
+ free (src);
6136
+ free (dst);
6137
+ }
6138
+
6139
+ // multi-thread
6140
+
6141
+ for (uint32_t n_threads = 1 ; n_threads <= std::thread::hardware_concurrency (); n_threads++) {
6142
+ char * src = (char *) malloc (size);
6143
+ char * dst = (char *) malloc (size);
6144
+
6145
+ for (size_t i = 0 ; i < size; i++) src[i] = i;
6146
+
6147
+ memcpy (dst, src, size); // heat-up
6148
+
6149
+ double tsum = 0.0 ;
6150
+
6151
+ auto helper = [&](int th) {
6152
+ const int64_t i0 = (th + 0 )*size/n_threads;
6153
+ const int64_t i1 = (th + 1 )*size/n_threads;
6154
+
6155
+ for (size_t i = 0 ; i < n; i++) {
6156
+ memcpy (dst + i0, src + i0, i1 - i0);
6097
6157
6098
- snprintf (strbuf, sizeof (strbuf), " sum: %f\n " , sum);
6099
- s += strbuf;
6158
+ src[i0 + rand () % (i1 - i0)] = rand () % 256 ;
6159
+ };
6160
+ };
6161
+
6162
+ const int64_t t0 = ggml_time_us ();
6163
+
6164
+ std::vector<std::thread> threads (n_threads - 1 );
6165
+ for (uint32_t th = 0 ; th < n_threads - 1 ; ++th) {
6166
+ threads[th] = std::thread (helper, th);
6167
+ }
6168
+
6169
+ helper (n_threads - 1 );
6170
+
6171
+ for (uint32_t th = 0 ; th < n_threads - 1 ; ++th) {
6172
+ threads[th].join ();
6173
+ }
6174
+
6175
+ const int64_t t1 = ggml_time_us ();
6176
+
6177
+ tsum += (t1 - t0)*1e-6 ;
6178
+
6179
+ snprintf (strbuf, sizeof (strbuf), " memcpy: %7.2f GB/s (%2d thread)\n " , (double ) (n*size)/(tsum*1e9 ), n_threads);
6180
+ s += strbuf;
6181
+
6182
+ // needed to prevent the compiler from optimizing the memcpy away
6183
+ {
6184
+ for (size_t i = 0 ; i < size; i++) sum += dst[i];
6100
6185
}
6101
6186
6102
6187
free (src);
6103
6188
free (dst);
6104
6189
}
6105
6190
6191
+ snprintf (strbuf, sizeof (strbuf), " sum: %f\n " , sum);
6192
+ s += strbuf;
6193
+
6106
6194
return s.c_str ();
6107
6195
}
6108
6196
0 commit comments