Skip to content

Commit f0a53ae

Browse files
authored
[μKernels]: bf16 lowering for splat layouts (#1078)
This patch extends the `micro-kernel` lowering to support the `bf16-splat` layout using the `bf16dp` operation.
1 parent 1d06346 commit f0a53ae

File tree

6 files changed

+1246
-551
lines changed

6 files changed

+1246
-551
lines changed

benchmarks/config/base/base.json

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,20 @@
7878
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --target-feature=avx2 --registerBlocking=2,32,2'" ],
7979
"extensions": [ "avx2" ]
8080
},
81+
"gemm_bf16_splat_dp2_mlir": {
82+
"type": "IR-GEN",
83+
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
84+
"environment": {},
85+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing'" ],
86+
"extensions": [ "avx2" ]
87+
},
88+
"gemm_bf16_splat_dp2_mlir_vector_kernel_avx512": {
89+
"type": "IR-GEN",
90+
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
91+
"environment": {},
92+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --vector-to-kernels --registerBlocking=8,32,2'" ],
93+
"extensions": [ "avx512_bf16" ]
94+
},
8195
"gemm_bf16_dp4_mlir": {
8296
"type": "IR-GEN",
8397
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=4" ],
@@ -134,6 +148,20 @@
134148
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --target-feature=avx2 --registerBlocking=2,32,2'" ],
135149
"extensions": [ "avx2" ]
136150
},
151+
"mlp_bf16_splat_dp2_mlir": {
152+
"type": "IR-GEN",
153+
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
154+
"environment": {},
155+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing'" ],
156+
"extensions": [ "avx2" ]
157+
},
158+
"mlp_bf16_splat_dp2_mlir_vector_kernel_avx512": {
159+
"type": "IR-GEN",
160+
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
161+
"environment": {},
162+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --vector-to-kernels --registerBlocking=8,32,2'" ],
163+
"extensions": [ "avx512_bf16" ]
164+
},
137165
"mlp_bf16_dp4_mlir": {
138166
"type": "IR-GEN",
139167
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=4" ],

benchmarks/config/omp/mlir-bf16.json

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,68 @@
9393
}
9494
}},
9595
{
96+
"gemm_bf16_splat_dp2_mlir": {
97+
"bf16_dp2_3x1024_omp_2_mlir": {
98+
"type": "IR-GEN",
99+
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
100+
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
101+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=8,16'" ],
102+
"extensions": [ "(avx2)" ]
103+
},
104+
"bf16_dp2_3x1024_omp_4_mlir": {
105+
"type": "IR-GEN",
106+
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
107+
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
108+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=8,8'" ],
109+
"extensions": [ "(avx2)" ]
110+
},
111+
"bf16_dp2_3x1024_omp_8_mlir": {
112+
"type": "IR-GEN",
113+
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
114+
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
115+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=4,8'" ],
116+
"extensions": [ "(avx2)" ]
117+
},
118+
"bf16_dp2_3x1024_omp_16_mlir": {
119+
"type": "IR-GEN",
120+
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
121+
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
122+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=2,8'" ],
123+
"extensions": [ "(avx2)" ]
124+
}
125+
}},
126+
{
127+
"gemm_bf16_splat_dp2_mlir_vector_kernel_avx512": {
128+
"bf16_dp2_3x1024_omp_2_mlir": {
129+
"type": "IR-GEN",
130+
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
131+
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
132+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=8,32,2'" ],
133+
"extensions": [ "avx512_bf16" ]
134+
},
135+
"bf16_dp2_3x1024_omp_4_mlir": {
136+
"type": "IR-GEN",
137+
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
138+
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
139+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=8,32,2'" ],
140+
"extensions": [ "avx512_bf16" ]
141+
},
142+
"bf16_dp2_3x1024_omp_8_mlir": {
143+
"type": "IR-GEN",
144+
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
145+
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
146+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=8,32,2'" ],
147+
"extensions": [ "avx512_bf16" ]
148+
},
149+
"bf16_dp2_3x1024_omp_16_mlir": {
150+
"type": "IR-GEN",
151+
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
152+
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
153+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=8,32,2'" ],
154+
"extensions": [ "avx512_bf16" ]
155+
}
156+
}},
157+
{
96158
"mlp_bf16_dp2_mlir": {
97159
"bf16_dp2_3x1024_omp_2_mlir": {
98160
"type": "IR-GEN",
@@ -186,6 +248,68 @@
186248
}
187249
}},
188250
{
251+
"mlp_bf16_splat_dp2_mlir": {
252+
"bf16_dp2_3x1024_omp_2_mlir": {
253+
"type": "IR-GEN",
254+
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
255+
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
256+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=8,16'" ],
257+
"extensions": [ "(avx2)" ]
258+
},
259+
"bf16_dp2_3x1024_omp_4_mlir": {
260+
"type": "IR-GEN",
261+
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
262+
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
263+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=8,8'" ],
264+
"extensions": [ "(avx2)" ]
265+
},
266+
"bf16_dp2_3x1024_omp_8_mlir": {
267+
"type": "IR-GEN",
268+
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
269+
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
270+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=4,8'" ],
271+
"extensions": [ "(avx2)" ]
272+
},
273+
"bf16_dp2_3x1024_omp_16_mlir": {
274+
"type": "IR-GEN",
275+
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
276+
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
277+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=2,8'" ],
278+
"extensions": [ "(avx2)" ]
279+
}
280+
}},
281+
{
282+
"mlp_bf16_splat_dp2_mlir_vector_kernel_avx512": {
283+
"bf16_dp2_3x1024_omp_2_mlir": {
284+
"type": "IR-GEN",
285+
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
286+
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
287+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=8,32,2'" ],
288+
"extensions": [ "avx512_bf16" ]
289+
},
290+
"bf16_dp2_3x1024_omp_4_mlir": {
291+
"type": "IR-GEN",
292+
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
293+
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
294+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=8,32,2'" ],
295+
"extensions": [ "avx512_bf16" ]
296+
},
297+
"bf16_dp2_3x1024_omp_8_mlir": {
298+
"type": "IR-GEN",
299+
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
300+
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
301+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=8,32,2'" ],
302+
"extensions": [ "avx512_bf16" ]
303+
},
304+
"bf16_dp2_3x1024_omp_16_mlir": {
305+
"type": "IR-GEN",
306+
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=0" ],
307+
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
308+
"flags": [ "-n", "100", "-run-args='--disable-vnni-packing --def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=8,32,2'" ],
309+
"extensions": [ "avx512_bf16" ]
310+
}
311+
}},
312+
{
189313
"gemm_bf16_dp4_mlir": {
190314
"bf16_dp4_3x1024_omp_2_mlir": {
191315
"type": "IR-GEN",

0 commit comments

Comments
 (0)