Skip to content

Commit

Permalink
Elementwise inner-block size (#876)
Browse files (browse the repository at this point in the history)
Compute the inner-block size of elementwise ops based on the register size.
  • Loading branch information
antonio-cortes-perez committed Sep 13, 2024
1 parent 3ffa8bf commit d693236
Show file tree
Hide file tree
Showing 7 changed files with 401 additions and 530 deletions.
3 changes: 1 addition & 2 deletions lib/Dialect/XeTile/Transforms/Blocking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,7 @@ getInnerBlockSizes(mlir::Operation *operation, mlir::Type elemTy, int height,
}

if (op == OpType::Elementwise) {
- // TODO: get from uArch?
- int64_t subgroupSize = 16;
+ int64_t subgroupSize = uArchInterface->getOneGRFSizeBits() / elementSize;

maxHeight = 1;
minHeight = 1;
Expand Down
3 changes: 1 addition & 2 deletions test/Conversion/XeTileToXeGPU/gemm_preop.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ module attributes {gpu.container_module} {
%28 = xetile.load_tile %arg6 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xf16> -> vector<32x32xf16>
xegpu.compile_hint
//CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice {{.*}} {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
- //CHECK-COUNT-64: {{.*}} = vector.extract_strided_slice {{.*}} {offsets = [{{.*}}], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf16> to vector<1x16xf16>
- //CHECK-COUNT-64: {{.*}} = arith.addf {{.*}}, {{.*}} : vector<1x16xf16>
+ //CHECK-COUNT-32: {{.*}} = arith.addf {{.*}}, {{.*}} : vector<1x32xf16>
%29 = arith.addf %27, %27 : vector<32x32xf16>
xegpu.compile_hint
%30 = xetile.update_tile_offset %arg5, [%c0, %c32] : !xetile.tile<32x32xf16>, index, index -> !xetile.tile<32x32xf16>
Expand Down
10 changes: 5 additions & 5 deletions test/Conversion/XeTileToXeGPU/non_pow2_stacking.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ module @test_module attributes {gpu.container_module} {
%c_value = arith.addf %a_valuee, %b_valuee : vector<24x32xf16>

// not a power-of-two (size is 6) inner block stacking reduction
- // CHECK: %[[STACK1:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x16xf16>, vector<1x16xf16>
- // CHECK: %[[STACK2:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x16xf16>, vector<1x16xf16>
- // CHECK: %[[STACK3:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x16xf16>, vector<1x16xf16>
- // CHECK: %[[STACK4:.*]] = vector.shuffle %[[STACK1]], %[[STACK2]] [0, 1, 2, 3] : vector<2x16xf16>, vector<2x16xf16>
- // CHECK: %[[STACK5:.*]] = vector.shuffle %[[STACK4]], %[[STACK3]] [0, 1, 2, 3, 4, 5] : vector<4x16xf16>, vector<2x16xf16>
+ // CHECK: %[[STACK1:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x32xf16>, vector<1x32xf16>
+ // CHECK: %[[STACK2:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x32xf16>, vector<1x32xf16>
+ // CHECK: %[[STACK3:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x32xf16>, vector<1x32xf16>
+ // CHECK: %[[STACK4:.*]] = vector.shuffle %[[STACK1]], %[[STACK2]] [0, 1, 2, 3] : vector<2x32xf16>, vector<2x32xf16>
+ // CHECK: %[[STACK5:.*]] = vector.shuffle %[[STACK4]], %[[STACK3]] [0, 1, 2, 3, 4, 5] : vector<4x32xf16>, vector<2x32xf16>

%c_valuee = xetile.tile_pack %c_value { inner_blocks = [6, 16] } : vector<24x32xf16> -> vector<4x2x6x16xf16>
xetile.store_tile %c_valuee, %b_tile : vector<4x2x6x16xf16>, !xetile.tile<24x32xf16, #xetile.tile_attr<inner_blocks = [6, 16]>>
Expand Down
356 changes: 114 additions & 242 deletions test/Conversion/XeTileToXeGPU/reduction.mlir

Large diffs are not rendered by default.

483 changes: 242 additions & 241 deletions test/Conversion/XeTileToXeGPU/sg_softmax.mlir

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions test/Conversion/XeTileToXeGPU/test_blocking.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@ gpu.module @test_kernel {

// CHECK-LABEL: test_blocking_elementwise
// CHECK-SAME: (%[[A_ORIG:.*]]: vector<64x64xf16>, %[[B_ORIG:.*]]: vector<64x64xf16>)
// CHECK: %[[A1:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 16] } : vector<64x64xf16> -> vector<64x4x1x16xf16>
// CHECK: %[[B1:.*]] = xetile.tile_pack %[[B_ORIG]] { inner_blocks = [1, 16] } : vector<64x64xf16> -> vector<64x4x1x16xf16>
// CHECK: %[[RES1:.*]] = arith.addf %[[A1]], %[[B1]] : vector<64x4x1x16xf16>
// CHECK: %[[RES_UNP1:.*]] = xetile.tile_unpack %[[RES1]] { inner_blocks = [1, 16] } : vector<64x4x1x16xf16> -> vector<64x64xf16>
// CHECK: %[[A2:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 16] } : vector<64x64xf16> -> vector<64x4x1x16xf16>
// CHECK: %[[RES2:.*]] = arith.negf %[[A2]] : vector<64x4x1x16xf16>
// CHECK: %[[RES_UNP2:.*]] = xetile.tile_unpack %[[RES2]] { inner_blocks = [1, 16] } : vector<64x4x1x16xf16> -> vector<64x64xf16>
// CHECK: %[[A3:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 16] } : vector<64x64xf16> -> vector<64x4x1x16xf16>
// CHECK: %[[RES3:.*]] = math.exp %[[A3]] : vector<64x4x1x16xf16>
// CHECK: %[[RES_UNP3:.*]] = xetile.tile_unpack %[[RES3]] { inner_blocks = [1, 16] } : vector<64x4x1x16xf16> -> vector<64x64xf16>
// CHECK: %[[A1:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 32] } : vector<64x64xf16> -> vector<64x2x1x32xf16>
// CHECK: %[[B1:.*]] = xetile.tile_pack %[[B_ORIG]] { inner_blocks = [1, 32] } : vector<64x64xf16> -> vector<64x2x1x32xf16>
// CHECK: %[[RES1:.*]] = arith.addf %[[A1]], %[[B1]] : vector<64x2x1x32xf16>
// CHECK: %[[RES_UNP1:.*]] = xetile.tile_unpack %[[RES1]] { inner_blocks = [1, 32] } : vector<64x2x1x32xf16> -> vector<64x64xf16>
// CHECK: %[[A2:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 32] } : vector<64x64xf16> -> vector<64x2x1x32xf16>
// CHECK: %[[RES2:.*]] = arith.negf %[[A2]] : vector<64x2x1x32xf16>
// CHECK: %[[RES_UNP2:.*]] = xetile.tile_unpack %[[RES2]] { inner_blocks = [1, 32] } : vector<64x2x1x32xf16> -> vector<64x64xf16>
// CHECK: %[[A3:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 32] } : vector<64x64xf16> -> vector<64x2x1x32xf16>
// CHECK: %[[RES3:.*]] = math.exp %[[A3]] : vector<64x2x1x32xf16>
// CHECK: %[[RES_UNP3:.*]] = xetile.tile_unpack %[[RES3]] { inner_blocks = [1, 32] } : vector<64x2x1x32xf16> -> vector<64x64xf16>
// CHECK: return %[[RES_UNP1]], %[[RES_UNP2]], %[[RES_UNP3]] : vector<64x64xf16>, vector<64x64xf16>, vector<64x64xf16>
func.func @test_blocking_elementwise(%a: vector<64x64xf16>, %b: vector<64x64xf16>) -> (vector<64x64xf16>, vector<64x64xf16>, vector<64x64xf16>) {
// Elementwise arith ops are handled in unified way, check some
Expand Down
56 changes: 28 additions & 28 deletions test/Dialect/XeTile/Transforms/blocking.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ gpu.module @test_kernel {
//CHECK: gpu.func @create_mask
//CHECK-DAG: %[[C20:.*]] = arith.constant 20
//CHECK-DAG: %[[C32:.*]] = arith.constant 32
//CHECK: %[[MASK:.*]] = vector.create_mask %[[C32]], %[[C20]], %[[C32]], %[[C20]] : vector<32x2x1x16xi1>
//CHECK: arith.select %[[MASK]], {{.*}} : vector<32x2x1x16xi1>, vector<32x2x1x16xf16>
//CHECK: %[[MASK:.*]] = vector.create_mask %[[C32]], %[[C20]], %[[C32]], %[[C20]] : vector<32x1x1x32xi1>
//CHECK: arith.select %[[MASK]], {{.*}} : vector<32x1x1x32xi1>, vector<32x1x1x32xf16>
gpu.func @create_mask(%a: vector<32x32xf16>, %b: vector<32x32xf16>, %c: memref<32x32xf16>) {
%c32 = arith.constant 32 : index
%c20 = arith.constant 20 : index
Expand Down Expand Up @@ -182,11 +182,11 @@ gpu.module @test_kernel {
// CHECK: %[[R1:.*]] = xetile.load_tile %[[R0]] { padding = 0.000000e+00 : f32 } : !xetile.tile<16x32xf16, #xetile.tile_attr<inner_blocks = [16, 32]>> -> vector<1x1x16x32xf16>
// CHECK: %[[R2:.*]] = xetile.tile_unpack %[[R1]] { inner_blocks = [16, 32] } : vector<1x1x16x32xf16> -> vector<16x32xf16>
%v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16>
// CHECK: %[[R3:.*]] = xetile.tile_pack %[[R2]] { inner_blocks = [1, 16] } : vector<16x32xf16> -> vector<16x2x1x16xf16>
// CHECK: %[[R4:.*]] = math.exp %[[R3]] : vector<16x2x1x16xf16>
// CHECK: %[[R3:.*]] = xetile.tile_pack %[[R2]] { inner_blocks = [1, 32] } : vector<16x32xf16> -> vector<16x1x1x32xf16>
// CHECK: %[[R4:.*]] = math.exp %[[R3]] : vector<16x1x1x32xf16>
%e = math.exp %v: vector<16x32xf16>
// CHECK: %[[R5:.*]] = vector.shape_cast %[[cst]] : vector<16xf16> to vector<16x1xf16>
// CHECK: %[[R6:.*]] = vector.multi_reduction <add>, %[[R4]], %[[R5]] [1, 3] : vector<16x2x1x16xf16> to vector<16x1xf16>
// CHECK: %[[R6:.*]] = vector.multi_reduction <add>, %[[R4]], %[[R5]] [1, 3] : vector<16x1x1x32xf16> to vector<16x1xf16>
// CHECK: %[[R7:.*]] = vector.shape_cast %[[R6]] : vector<16x1xf16> to vector<16xf16>
%r = vector.multi_reduction <add>, %e, %acc [1] : vector<16x32xf16> to vector<16xf16>
// CHECK: %[[R8:.*]] = vector.shape_cast %[[R7]] : vector<16xf16> to vector<2x8xf16>
Expand All @@ -210,12 +210,12 @@ gpu.module @test_kernel {
//CHECK: %[[R1:.*]] = xetile.load_tile %[[R0]] { padding = 0.000000e+00 : f32 } : !xetile.tile<16x32xf16, #xetile.tile_attr<inner_blocks = [16, 32]>> -> vector<1x1x16x32xf16>
//CHECK: %[[R2:.*]] = xetile.tile_unpack %[[R1]] { inner_blocks = [16, 32] } : vector<1x1x16x32xf16> -> vector<16x32xf16>
%v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16>
//CHECK: %[[R3:.*]] = xetile.tile_pack %[[R2]] { inner_blocks = [1, 16] } : vector<16x32xf16> -> vector<16x2x1x16xf16>
//CHECK: %[[R4:.*]] = math.exp %[[R3]] : vector<16x2x1x16xf16>
//CHECK: %[[R3:.*]] = xetile.tile_pack %[[R2]] { inner_blocks = [1, 32] } : vector<16x32xf16> -> vector<16x1x1x32xf16>
//CHECK: %[[R4:.*]] = math.exp %[[R3]] : vector<16x1x1x32xf16>
%e = math.exp %v: vector<16x32xf16>
//CHECK: %[[R5:.*]] = vector.shape_cast %[[cst]] : vector<32xf16> to vector<2x16xf16>
//CHECK: %[[R6:.*]] = vector.multi_reduction <add>, %[[R4]], %[[R5]] [0, 2] : vector<16x2x1x16xf16> to vector<2x16xf16>
//CHECK: %[[R7:.*]] = vector.shape_cast %[[R6]] : vector<2x16xf16> to vector<32xf16>
//CHECK: %[[R5:.*]] = vector.shape_cast %[[cst]] : vector<32xf16> to vector<1x32xf16>
//CHECK: %[[R6:.*]] = vector.multi_reduction <add>, %[[R4]], %[[R5]] [0, 2] : vector<16x1x1x32xf16> to vector<1x32xf16>
//CHECK: %[[R7:.*]] = vector.shape_cast %[[R6]] : vector<1x32xf16> to vector<32xf16>
%r = vector.multi_reduction <add>, %e, %acc [0] : vector<16x32xf16> to vector<32xf16>
//CHECK: %[[R8:.*]] = vector.shape_cast %[[R7]] : vector<32xf16> to vector<4x8xf16>
%c = vector.shape_cast %r: vector<32xf16> to vector<4x8xf16>
Expand Down Expand Up @@ -298,18 +298,18 @@ gpu.module @test_kernel {
//CHECK: %[[r2:.*]] = xetile.tile_unpack %[[r1]] { inner_blocks = [32, 32] } : vector<1x2x32x32xf16> -> vector<32x64xf16>
%2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>

//CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 16] } : vector<32x64xf16> -> vector<32x4x1x16xf16>
//CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x4x1x16xf16>
//CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 32] } : vector<32x64xf16> -> vector<32x2x1x32xf16>
//CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x2x1x32xf16>
%3 = math.exp %2: vector<32x64xf16>

//CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [0, 2] : vector<32x4x1x16xf16> -> vector<1x4x1x16xf16>
//CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [0, 2] : vector<32x2x1x32xf16> -> vector<1x2x1x32xf16>
%4 = xetile.reduction <add>, %3 [0]: vector<32x64xf16> -> vector<1x64xf16>

//CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [0, 2] : vector<1x4x1x16xf16> -> vector<32x4x1x16xf16>
//CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [0, 2] : vector<1x2x1x32xf16> -> vector<32x2x1x32xf16>
%5 = xetile.broadcast %4 [0]: vector<1x64xf16> -> vector<32x64xf16>

//CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x4x1x16xf16>
//CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 16] } : vector<32x4x1x16xf16> -> vector<32x64xf16>
//CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x2x1x32xf16>
//CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 32] } : vector<32x2x1x32xf16> -> vector<32x64xf16>
%6 = arith.divf %3, %5: vector<32x64xf16>

//CHECK: %[[r9:.*]] = xetile.init_tile %[[arg0]][0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr<inner_blocks = [8, 32]>>
Expand All @@ -332,18 +332,18 @@ gpu.module @test_kernel {
//CHECK: %[[r2:.*]] = xetile.tile_unpack %[[r1]] { inner_blocks = [32, 32] } : vector<1x2x32x32xf16> -> vector<32x64xf16>
%2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>

//CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 16] } : vector<32x64xf16> -> vector<32x4x1x16xf16>
//CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x4x1x16xf16>
//CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 32] } : vector<32x64xf16> -> vector<32x2x1x32xf16>
//CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x2x1x32xf16>
%3 = math.exp %2: vector<32x64xf16>

//CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [1, 3] : vector<32x4x1x16xf16> -> vector<32x1x1x1xf16>
//CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [1, 3] : vector<32x2x1x32xf16> -> vector<32x1x1x1xf16>
%4 = xetile.reduction <add>, %3 [1]: vector<32x64xf16> -> vector<32x1xf16>

//CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [1, 3] : vector<32x1x1x1xf16> -> vector<32x4x1x16xf16>
//CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [1, 3] : vector<32x1x1x1xf16> -> vector<32x2x1x32xf16>
%5 = xetile.broadcast %4 [1]: vector<32x1xf16> -> vector<32x64xf16>

//CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x4x1x16xf16>
//CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 16] } : vector<32x4x1x16xf16> -> vector<32x64xf16>
//CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x2x1x32xf16>
//CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 32] } : vector<32x2x1x32xf16> -> vector<32x64xf16>
%6 = arith.divf %3, %5: vector<32x64xf16>

//CHECK: %[[r9:.*]] = xetile.init_tile %[[arg0]][0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr<inner_blocks = [8, 32]>>
Expand All @@ -366,18 +366,18 @@ gpu.module @test_kernel {
//CHECK: %[[r2:.*]] = xetile.tile_unpack %[[r1]] { inner_blocks = [32, 32] } : vector<1x2x32x32xf16> -> vector<32x64xf16>
%2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>

//CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 16] } : vector<32x64xf16> -> vector<32x4x1x16xf16>
//CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x4x1x16xf16>
//CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 32] } : vector<32x64xf16> -> vector<32x2x1x32xf16>
//CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x2x1x32xf16>
%3 = math.exp %2: vector<32x64xf16>

//CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [1, 3] : vector<32x4x1x16xf16> -> vector<32x1x1x1xf16>
//CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [1, 3] : vector<32x2x1x32xf16> -> vector<32x1x1x1xf16>
%4 = xetile.reduction <add>, %3 [1]: vector<32x64xf16> -> vector<32x1xf16>

//CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [1, 3] : vector<32x1x1x1xf16> -> vector<32x4x1x16xf16>
//CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [1, 3] : vector<32x1x1x1xf16> -> vector<32x2x1x32xf16>
%5 = xetile.broadcast %4 [1]: vector<32x1xf16> -> vector<32x64xf16>

//CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x4x1x16xf16>
//CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 16] } : vector<32x4x1x16xf16> -> vector<32x64xf16>
//CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x2x1x32xf16>
//CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 32] } : vector<32x2x1x32xf16> -> vector<32x64xf16>
%6 = arith.divf %3, %5: vector<32x64xf16>

//CHECK: %[[r9:.*]] = xetile.tile_pack %[[r8]] { inner_blocks = [8, 16] } : vector<32x64xf16> -> vector<4x4x8x16xf16>
Expand Down

0 comments on commit d693236

Please sign in to comment.