Skip to content

Commit 3db0b7e

Browse files
committed
musa: update compile flags
Signed-off-by: Xiaodong Ye <[email protected]>
1 parent: a86a580; commit: 3db0b7e

File tree

2 files changed, with 4 additions and 6 deletions.

ggml/src/ggml-cuda/topk-moe.cu

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -33,9 +33,9 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
3333
float logits_r[experts_per_thread];
3434

3535
#pragma unroll
36-
for (int i = 0; i < n_experts; i += WARP_SIZE) {
36+
for (int i = 0; i < static_cast<int>(n_experts); i += WARP_SIZE) {
3737
const int expert = i + threadIdx.x;
38-
logits_r[i / WARP_SIZE] = n_experts % WARP_SIZE == 0 || expert < n_experts ? logits[expert] : -INFINITY;
38+
logits_r[i / WARP_SIZE] = n_experts % WARP_SIZE == 0 || expert < static_cast<int>(n_experts) ? logits[expert] : -INFINITY;
3939
}
4040

4141
float max_val = logits_r[0];
@@ -83,7 +83,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
8383
#pragma unroll
8484
for (int i = 1; i < experts_per_thread; i++) {
8585
const int expert = threadIdx.x + i * WARP_SIZE;
86-
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
86+
if ((n_experts % WARP_SIZE == 0 || expert < static_cast<int>(n_experts)) && wt[i] > max_val) {
8787
max_val = wt[i];
8888
max_expert = expert;
8989
}
@@ -204,8 +204,6 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
204204

205205
GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);
206206

207-
cudaStream_t stream = ctx.stream();
208-
209207
const int n_expert_used = weights->ne[1];
210208

211209
if (with_norm) {

ggml/src/ggml-musa/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@ if (MUSAToolkit_FOUND)
5656

5757
set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
5858
foreach(SOURCE ${GGML_SOURCES_MUSA})
59-
set(COMPILE_FLAGS "-fsigned-char -x musa -mtgpu")
59+
set(COMPILE_FLAGS "-Od3 -fno-strict-aliasing -ffast-math -fsigned-char -x musa -mtgpu -fmusa-flush-denormals-to-zero")
6060
foreach(ARCH ${MUSA_ARCHITECTURES})
6161
set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
6262
endforeach()

0 commit comments

Comments (0)