Elementwise inner-block size (#876)

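Compute the inner-block size of elementwise ops from the register size
(one GRF's size in bits, as reported by the uArch interface, divided by the
element size) instead of using a hard-coded value of 16.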
intel · Sep 13, 2024
commit d693236 · 1 parent 3ffa8bf
Show file tree

Hide file tree

Showing 7 changed files with 401 additions and 530 deletions.
diff --git a/lib/Dialect/XeTile/Transforms/Blocking.cpp b/lib/Dialect/XeTile/Transforms/Blocking.cpp
@@ -164,8 +164,7 @@ getInnerBlockSizes(mlir::Operation *operation, mlir::Type elemTy, int height,
   }
 
   if (op == OpType::Elementwise) {
-    // TODO: get from uArch?
-    int64_t subgroupSize = 16;
+    int64_t subgroupSize = uArchInterface->getOneGRFSizeBits() / elementSize;
 
     maxHeight = 1;
     minHeight = 1;

diff --git a/test/Conversion/XeTileToXeGPU/gemm_preop.mlir b/test/Conversion/XeTileToXeGPU/gemm_preop.mlir
@@ -63,8 +63,7 @@ module attributes {gpu.container_module} {
         %28 = xetile.load_tile %arg6 { padding = 0.000000e+00 : f32 }  : !xetile.tile<32x32xf16> -> vector<32x32xf16>
         xegpu.compile_hint
         //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice {{.*}} {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
-        //CHECK-COUNT-64: {{.*}} = vector.extract_strided_slice {{.*}} {offsets = [{{.*}}], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf16> to vector<1x16xf16>
-        //CHECK-COUNT-64: {{.*}} = arith.addf {{.*}}, {{.*}} : vector<1x16xf16>
+        //CHECK-COUNT-32: {{.*}} = arith.addf {{.*}}, {{.*}} : vector<1x32xf16>
         %29 = arith.addf %27, %27 : vector<32x32xf16>
         xegpu.compile_hint
         %30 = xetile.update_tile_offset %arg5, [%c0,  %c32] : !xetile.tile<32x32xf16>, index, index -> !xetile.tile<32x32xf16>

diff --git a/test/Conversion/XeTileToXeGPU/non_pow2_stacking.mlir b/test/Conversion/XeTileToXeGPU/non_pow2_stacking.mlir
@@ -15,11 +15,11 @@ module @test_module attributes {gpu.container_module} {
       %c_value = arith.addf %a_valuee, %b_valuee : vector<24x32xf16>
 
       // not a power-of-two (size is 6) inner block stacking reduction
-      // CHECK: %[[STACK1:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x16xf16>, vector<1x16xf16>
-      // CHECK: %[[STACK2:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x16xf16>, vector<1x16xf16>
-      // CHECK: %[[STACK3:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x16xf16>, vector<1x16xf16>
-      // CHECK: %[[STACK4:.*]] = vector.shuffle %[[STACK1]], %[[STACK2]] [0, 1, 2, 3] : vector<2x16xf16>, vector<2x16xf16>
-      // CHECK: %[[STACK5:.*]] = vector.shuffle %[[STACK4]], %[[STACK3]] [0, 1, 2, 3, 4, 5] : vector<4x16xf16>, vector<2x16xf16>
+      // CHECK: %[[STACK1:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x32xf16>, vector<1x32xf16>
+      // CHECK: %[[STACK2:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x32xf16>, vector<1x32xf16>
+      // CHECK: %[[STACK3:.*]] = vector.shuffle {{.*}}, {{.*}} [0, 1] : vector<1x32xf16>, vector<1x32xf16>
+      // CHECK: %[[STACK4:.*]] = vector.shuffle %[[STACK1]], %[[STACK2]] [0, 1, 2, 3] : vector<2x32xf16>, vector<2x32xf16>
+      // CHECK: %[[STACK5:.*]] = vector.shuffle %[[STACK4]], %[[STACK3]] [0, 1, 2, 3, 4, 5] : vector<4x32xf16>, vector<2x32xf16>
 
       %c_valuee = xetile.tile_pack %c_value { inner_blocks = [6, 16] }  : vector<24x32xf16> -> vector<4x2x6x16xf16>
       xetile.store_tile %c_valuee, %b_tile  : vector<4x2x6x16xf16>, !xetile.tile<24x32xf16, #xetile.tile_attr<inner_blocks = [6, 16]>>

diff --git a/test/Conversion/XeTileToXeGPU/reduction.mlir b/test/Conversion/XeTileToXeGPU/reduction.mlir
diff --git a/test/Conversion/XeTileToXeGPU/sg_softmax.mlir b/test/Conversion/XeTileToXeGPU/sg_softmax.mlir
diff --git a/test/Conversion/XeTileToXeGPU/test_blocking.mlir b/test/Conversion/XeTileToXeGPU/test_blocking.mlir
@@ -4,16 +4,16 @@ gpu.module @test_kernel {
 
 // CHECK-LABEL: test_blocking_elementwise
 //  CHECK-SAME: (%[[A_ORIG:.*]]: vector<64x64xf16>, %[[B_ORIG:.*]]: vector<64x64xf16>)
-//       CHECK: %[[A1:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 16] } : vector<64x64xf16> -> vector<64x4x1x16xf16>
-//       CHECK: %[[B1:.*]] = xetile.tile_pack %[[B_ORIG]] { inner_blocks = [1, 16] } : vector<64x64xf16> -> vector<64x4x1x16xf16>
-//       CHECK: %[[RES1:.*]] = arith.addf %[[A1]], %[[B1]] : vector<64x4x1x16xf16>
-//       CHECK: %[[RES_UNP1:.*]] = xetile.tile_unpack %[[RES1]] { inner_blocks = [1, 16] } : vector<64x4x1x16xf16> -> vector<64x64xf16>
-//       CHECK: %[[A2:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 16] } : vector<64x64xf16> -> vector<64x4x1x16xf16>
-//       CHECK: %[[RES2:.*]] = arith.negf %[[A2]] : vector<64x4x1x16xf16>
-//       CHECK: %[[RES_UNP2:.*]] = xetile.tile_unpack %[[RES2]] { inner_blocks = [1, 16] } : vector<64x4x1x16xf16> -> vector<64x64xf16>
-//       CHECK: %[[A3:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 16] } : vector<64x64xf16> -> vector<64x4x1x16xf16>
-//       CHECK: %[[RES3:.*]] = math.exp %[[A3]] : vector<64x4x1x16xf16>
-//       CHECK: %[[RES_UNP3:.*]] = xetile.tile_unpack %[[RES3]] { inner_blocks = [1, 16] } : vector<64x4x1x16xf16> -> vector<64x64xf16>
+//       CHECK: %[[A1:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 32] } : vector<64x64xf16> -> vector<64x2x1x32xf16>
+//       CHECK: %[[B1:.*]] = xetile.tile_pack %[[B_ORIG]] { inner_blocks = [1, 32] } : vector<64x64xf16> -> vector<64x2x1x32xf16>
+//       CHECK: %[[RES1:.*]] = arith.addf %[[A1]], %[[B1]] : vector<64x2x1x32xf16>
+//       CHECK: %[[RES_UNP1:.*]] = xetile.tile_unpack %[[RES1]] { inner_blocks = [1, 32] } : vector<64x2x1x32xf16> -> vector<64x64xf16>
+//       CHECK: %[[A2:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 32] } : vector<64x64xf16> -> vector<64x2x1x32xf16>
+//       CHECK: %[[RES2:.*]] = arith.negf %[[A2]] : vector<64x2x1x32xf16>
+//       CHECK: %[[RES_UNP2:.*]] = xetile.tile_unpack %[[RES2]] { inner_blocks = [1, 32] } : vector<64x2x1x32xf16> -> vector<64x64xf16>
+//       CHECK: %[[A3:.*]] = xetile.tile_pack %[[A_ORIG]] { inner_blocks = [1, 32] } : vector<64x64xf16> -> vector<64x2x1x32xf16>
+//       CHECK: %[[RES3:.*]] = math.exp %[[A3]] : vector<64x2x1x32xf16>
+//       CHECK: %[[RES_UNP3:.*]] = xetile.tile_unpack %[[RES3]] { inner_blocks = [1, 32] } : vector<64x2x1x32xf16> -> vector<64x64xf16>
 //       CHECK: return %[[RES_UNP1]], %[[RES_UNP2]], %[[RES_UNP3]] : vector<64x64xf16>, vector<64x64xf16>, vector<64x64xf16>
 func.func @test_blocking_elementwise(%a: vector<64x64xf16>, %b: vector<64x64xf16>) -> (vector<64x64xf16>, vector<64x64xf16>, vector<64x64xf16>) {
 // Elementwise arith ops are handled in unified way, check some

diff --git a/test/Dialect/XeTile/Transforms/blocking.mlir b/test/Dialect/XeTile/Transforms/blocking.mlir
@@ -144,8 +144,8 @@ gpu.module @test_kernel {
     //CHECK: gpu.func @create_mask
     //CHECK-DAG: %[[C20:.*]] = arith.constant 20
     //CHECK-DAG: %[[C32:.*]] = arith.constant 32
-    //CHECK: %[[MASK:.*]] = vector.create_mask %[[C32]], %[[C20]], %[[C32]], %[[C20]] : vector<32x2x1x16xi1>
-    //CHECK: arith.select %[[MASK]], {{.*}} : vector<32x2x1x16xi1>, vector<32x2x1x16xf16>
+    //CHECK: %[[MASK:.*]] = vector.create_mask %[[C32]], %[[C20]], %[[C32]], %[[C20]] : vector<32x1x1x32xi1>
+    //CHECK: arith.select %[[MASK]], {{.*}} : vector<32x1x1x32xi1>, vector<32x1x1x32xf16>
     gpu.func @create_mask(%a: vector<32x32xf16>, %b: vector<32x32xf16>, %c: memref<32x32xf16>) {
       %c32 = arith.constant 32 : index
       %c20 = arith.constant 20 : index
@@ -182,11 +182,11 @@ gpu.module @test_kernel {
       // CHECK: %[[R1:.*]] = xetile.load_tile %[[R0]] { padding = 0.000000e+00 : f32 }  : !xetile.tile<16x32xf16, #xetile.tile_attr<inner_blocks = [16, 32]>> -> vector<1x1x16x32xf16>
       // CHECK: %[[R2:.*]] = xetile.tile_unpack %[[R1]] { inner_blocks = [16, 32] }  : vector<1x1x16x32xf16> -> vector<16x32xf16>
       %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16>
-      // CHECK: %[[R3:.*]] = xetile.tile_pack %[[R2]] { inner_blocks = [1, 16] }  : vector<16x32xf16> -> vector<16x2x1x16xf16>
-      // CHECK: %[[R4:.*]] = math.exp %[[R3]] : vector<16x2x1x16xf16>
+      // CHECK: %[[R3:.*]] = xetile.tile_pack %[[R2]] { inner_blocks = [1, 32] }  : vector<16x32xf16> -> vector<16x1x1x32xf16>
+      // CHECK: %[[R4:.*]] = math.exp %[[R3]] : vector<16x1x1x32xf16>
       %e = math.exp %v: vector<16x32xf16>
       // CHECK: %[[R5:.*]] = vector.shape_cast %[[cst]] : vector<16xf16> to vector<16x1xf16>
-      // CHECK: %[[R6:.*]] = vector.multi_reduction <add>, %[[R4]], %[[R5]] [1, 3] : vector<16x2x1x16xf16> to vector<16x1xf16>
+      // CHECK: %[[R6:.*]] = vector.multi_reduction <add>, %[[R4]], %[[R5]] [1, 3] : vector<16x1x1x32xf16> to vector<16x1xf16>
       // CHECK: %[[R7:.*]] = vector.shape_cast %[[R6]] : vector<16x1xf16> to vector<16xf16>
       %r = vector.multi_reduction <add>, %e, %acc [1] : vector<16x32xf16> to vector<16xf16>
       // CHECK: %[[R8:.*]] = vector.shape_cast %[[R7]] : vector<16xf16> to vector<2x8xf16>
@@ -210,12 +210,12 @@ gpu.module @test_kernel {
       //CHECK: %[[R1:.*]] = xetile.load_tile %[[R0]] { padding = 0.000000e+00 : f32 }  : !xetile.tile<16x32xf16, #xetile.tile_attr<inner_blocks = [16, 32]>> -> vector<1x1x16x32xf16>
       //CHECK: %[[R2:.*]] = xetile.tile_unpack %[[R1]] { inner_blocks = [16, 32] }  : vector<1x1x16x32xf16> -> vector<16x32xf16>
       %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16>
-      //CHECK: %[[R3:.*]] = xetile.tile_pack %[[R2]] { inner_blocks = [1, 16] }  : vector<16x32xf16> -> vector<16x2x1x16xf16>
-      //CHECK: %[[R4:.*]] = math.exp %[[R3]] : vector<16x2x1x16xf16>
+      //CHECK: %[[R3:.*]] = xetile.tile_pack %[[R2]] { inner_blocks = [1, 32] }  : vector<16x32xf16> -> vector<16x1x1x32xf16>
+      //CHECK: %[[R4:.*]] = math.exp %[[R3]] : vector<16x1x1x32xf16>
       %e = math.exp %v: vector<16x32xf16>
-      //CHECK: %[[R5:.*]] = vector.shape_cast %[[cst]] : vector<32xf16> to vector<2x16xf16>
-      //CHECK: %[[R6:.*]] = vector.multi_reduction <add>, %[[R4]], %[[R5]] [0, 2] : vector<16x2x1x16xf16> to vector<2x16xf16>
-      //CHECK: %[[R7:.*]] = vector.shape_cast %[[R6]] : vector<2x16xf16> to vector<32xf16>
+      //CHECK: %[[R5:.*]] = vector.shape_cast %[[cst]] : vector<32xf16> to vector<1x32xf16>
+      //CHECK: %[[R6:.*]] = vector.multi_reduction <add>, %[[R4]], %[[R5]] [0, 2] : vector<16x1x1x32xf16> to vector<1x32xf16>
+      //CHECK: %[[R7:.*]] = vector.shape_cast %[[R6]] : vector<1x32xf16> to vector<32xf16>
       %r = vector.multi_reduction <add>, %e, %acc [0] : vector<16x32xf16> to vector<32xf16>
       //CHECK: %[[R8:.*]] = vector.shape_cast %[[R7]] : vector<32xf16> to vector<4x8xf16>
       %c = vector.shape_cast %r: vector<32xf16> to vector<4x8xf16>
@@ -298,18 +298,18 @@ gpu.module @test_kernel {
       //CHECK: %[[r2:.*]] = xetile.tile_unpack %[[r1]] { inner_blocks = [32, 32] }  : vector<1x2x32x32xf16> -> vector<32x64xf16>
       %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>
 
-      //CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 16] }  : vector<32x64xf16> -> vector<32x4x1x16xf16>
-      //CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x4x1x16xf16>
+      //CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 32] }  : vector<32x64xf16> -> vector<32x2x1x32xf16>
+      //CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x2x1x32xf16>
       %3 = math.exp %2: vector<32x64xf16>
 
-      //CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [0, 2] : vector<32x4x1x16xf16> -> vector<1x4x1x16xf16>
+      //CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [0, 2] : vector<32x2x1x32xf16> -> vector<1x2x1x32xf16>
       %4 = xetile.reduction <add>, %3 [0]: vector<32x64xf16> -> vector<1x64xf16>
 
-      //CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [0, 2] : vector<1x4x1x16xf16> -> vector<32x4x1x16xf16>
+      //CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [0, 2] : vector<1x2x1x32xf16> -> vector<32x2x1x32xf16>
       %5 = xetile.broadcast %4 [0]: vector<1x64xf16> -> vector<32x64xf16>
 
-      //CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x4x1x16xf16>
-      //CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 16] }  : vector<32x4x1x16xf16> -> vector<32x64xf16>
+      //CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x2x1x32xf16>
+      //CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 32] }  : vector<32x2x1x32xf16> -> vector<32x64xf16>
       %6 = arith.divf %3, %5: vector<32x64xf16>
 
       //CHECK: %[[r9:.*]] = xetile.init_tile %[[arg0]][0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr<inner_blocks = [8, 32]>>
@@ -332,18 +332,18 @@ gpu.module @test_kernel {
       //CHECK: %[[r2:.*]] = xetile.tile_unpack %[[r1]] { inner_blocks = [32, 32] }  : vector<1x2x32x32xf16> -> vector<32x64xf16>
       %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>
 
-      //CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 16] }  : vector<32x64xf16> -> vector<32x4x1x16xf16>
-      //CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x4x1x16xf16>
+      //CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 32] }  : vector<32x64xf16> -> vector<32x2x1x32xf16>
+      //CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x2x1x32xf16>
       %3 = math.exp %2: vector<32x64xf16>
 
-      //CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [1, 3] : vector<32x4x1x16xf16> -> vector<32x1x1x1xf16>
+      //CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [1, 3] : vector<32x2x1x32xf16> -> vector<32x1x1x1xf16>
       %4 = xetile.reduction <add>, %3 [1]: vector<32x64xf16> -> vector<32x1xf16>
 
-      //CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [1, 3] : vector<32x1x1x1xf16> -> vector<32x4x1x16xf16>
+      //CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [1, 3] : vector<32x1x1x1xf16> -> vector<32x2x1x32xf16>
       %5 = xetile.broadcast %4 [1]: vector<32x1xf16> -> vector<32x64xf16>
 
-      //CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x4x1x16xf16>
-      //CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 16] }  : vector<32x4x1x16xf16> -> vector<32x64xf16>
+      //CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x2x1x32xf16>
+      //CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 32] }  : vector<32x2x1x32xf16> -> vector<32x64xf16>
       %6 = arith.divf %3, %5: vector<32x64xf16>
 
       //CHECK: %[[r9:.*]] = xetile.init_tile %[[arg0]][0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr<inner_blocks = [8, 32]>>
@@ -366,18 +366,18 @@ gpu.module @test_kernel {
       //CHECK: %[[r2:.*]] = xetile.tile_unpack %[[r1]] { inner_blocks = [32, 32] }  : vector<1x2x32x32xf16> -> vector<32x64xf16>
       %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>
 
-      //CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 16] }  : vector<32x64xf16> -> vector<32x4x1x16xf16>
-      //CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x4x1x16xf16>
+      //CHECK: %[[r3:.*]] = xetile.tile_pack %[[r2]] { inner_blocks = [1, 32] }  : vector<32x64xf16> -> vector<32x2x1x32xf16>
+      //CHECK: %[[r4:.*]] = math.exp %[[r3]] : vector<32x2x1x32xf16>
       %3 = math.exp %2: vector<32x64xf16>
 
-      //CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [1, 3] : vector<32x4x1x16xf16> -> vector<32x1x1x1xf16>
+      //CHECK: %[[r5:.*]] = xetile.reduction <add>, %[[r4]] [1, 3] : vector<32x2x1x32xf16> -> vector<32x1x1x1xf16>
       %4 = xetile.reduction <add>, %3 [1]: vector<32x64xf16> -> vector<32x1xf16>
 
-      //CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [1, 3] : vector<32x1x1x1xf16> -> vector<32x4x1x16xf16>
+      //CHECK: %[[r6:.*]] = xetile.broadcast %[[r5]] [1, 3] : vector<32x1x1x1xf16> -> vector<32x2x1x32xf16>
       %5 = xetile.broadcast %4 [1]: vector<32x1xf16> -> vector<32x64xf16>
 
-      //CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x4x1x16xf16>
-      //CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 16] }  : vector<32x4x1x16xf16> -> vector<32x64xf16>
+      //CHECK: %[[r7:.*]] = arith.divf %[[r4]], %[[r6]] : vector<32x2x1x32xf16>
+      //CHECK: %[[r8:.*]] = xetile.tile_unpack %[[r7]] { inner_blocks = [1, 32] }  : vector<32x2x1x32xf16> -> vector<32x64xf16>
       %6 = arith.divf %3, %5: vector<32x64xf16>
 
       //CHECK: %[[r9:.*]] = xetile.tile_pack %[[r8]] { inner_blocks = [8, 16] }  : vector<32x64xf16> -> vector<4x4x8x16xf16>