Merge remote-tracking branch 'upstream/main' into main
XinyueLunaFan committed Aug 22, 2023
2 parents 4d83590 + d1bfdcf commit 2b30545
Showing 3 changed files with 221 additions and 0 deletions.
29 changes: 29 additions & 0 deletions benchmarks/spirv/perf_report.txt
@@ -0,0 +1,29 @@
onednn
MESA: warning: Driver does not support the 0x20a PCI ID.
Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%-time%,%-Gflops%,%0time%,%0Gflops%
perf,gpu,ocl:gen9:any,,--mode=P --eltwise --engine=gpu --dir=FWD_I --tag=axb --alg=relu --alpha=0 --beta=0 512x640x20x15,0,2.26288,0,2.42245,0
tests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0
total perf: min(ms):2.26288 avg(ms):2.42245


relu.tile.seq1.simd16.spirv.block.mlir
MESA: warning: Driver does not support the 0x20a PCI ID.
the kernel execution time is (ms):avg: 1.8140, min: 1.7814, max: 3.6022 (over 100 runs)
relu.tile.seq1.simd16.spirv.mlir
MESA: warning: Driver does not support the 0x20a PCI ID.
the kernel execution time is (ms):avg: 2.7702, min: 2.6317, max: 5.5091 (over 100 runs)
relu.tile.seq1.simd32.spirv.mlir
MESA: warning: Driver does not support the 0x20a PCI ID.
the kernel execution time is (ms):avg: 2.2354, min: 2.1802, max: 4.4122 (over 100 runs)
relu.tile.seq2.simd16.spirv.mlir
MESA: warning: Driver does not support the 0x20a PCI ID.
the kernel execution time is (ms):avg: 2.4374, min: 2.3429, max: 4.9346 (over 100 runs)
relu.tile.seq2.simd32.spirv.mlir
MESA: warning: Driver does not support the 0x20a PCI ID.
the kernel execution time is (ms):avg: 2.2564, min: 2.1682, max: 4.4366 (over 100 runs)
relu.tile.seq4.simd32.spirv.mlir
MESA: warning: Driver does not support the 0x20a PCI ID.
the kernel execution time is (ms):avg: 2.3925, min: 2.3198, max: 4.8013 (over 100 runs)
relu.tile.seq4.simd16.spirv.mlir
MESA: warning: Driver does not support the 0x20a PCI ID.
the kernel execution time is (ms):avg: 2.3121, min: 2.2325, max: 4.5760 (over 100 runs)
80 changes: 80 additions & 0 deletions benchmarks/spirv/relu.tile.seq1.simd16.spirv.block.mlir
@@ -0,0 +1,80 @@
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime
module attributes {gpu.container_module, torch.debug_module_name = "ReLU"} {
memref.global "private" constant @__constant_512x640x20x15xf32 : memref<512x640x20x15xf32> = dense<1.300000e+00>
func.func @forward(%arg0: memref<512x640x20x15xf32>) -> memref<512x640x20x15xf32> attributes {llvm.emit_c_interface} {
%c16 = arith.constant 16 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c96000 = arith.constant 96000 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%memref = gpu.alloc host_shared () : memref<512x640x20x15xf32>
memref.copy %arg0, %memref : memref<512x640x20x15xf32> to memref<512x640x20x15xf32>
%collapse_shape = memref.collapse_shape %memref [[0, 1, 2, 3]] : memref<512x640x20x15xf32> into memref<98304000xf32>
%memref_0 = gpu.alloc host_shared () : memref<98304000xf32>
gpu.launch_func @forward_kernel::@forward_kernel blocks in (%c96000, %c1, %c1) threads in (%c16, %c64, %c1) args(%c1024 : index, %c64 : index, %collapse_shape : memref<98304000xf32>, %cst : f32, %memref_0 : memref<98304000xf32>)
%expand_shape = memref.expand_shape %memref_0 [[0, 1, 2, 3]] : memref<98304000xf32> into memref<512x640x20x15xf32>
gpu.dealloc %memref : memref<512x640x20x15xf32>
return %expand_shape : memref<512x640x20x15xf32>
}
spirv.module @__spv__forward_kernel Physical64 OpenCL requires #spirv.vce<v1.1, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, SubgroupBufferBlockIOINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_subgroups, SPV_KHR_no_integer_wrap_decoration]> {
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi64>, Input>
spirv.GlobalVariable @__builtin_var_SubgroupId__ built_in("SubgroupId") : !spirv.ptr<i32, Input>
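// Hand-written kernel using Intel subgroup block I/O: the per-subgroup base offset is
// workgroup_id.x * %arg0 + subgroup_id * %arg1; each 16-lane subgroup block-reads one 32-bit
// element per lane, applies ReLU (FUnordGreaterThan + Select against 0.0), and block-writes
// the result.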
spirv.func @forward_kernel(%arg0: i64, %arg1: i64, %arg2: !spirv.ptr<!spirv.array<98304000 x f32>, CrossWorkgroup>, %arg3: f32, %arg4: !spirv.ptr<!spirv.array<98304000 x f32>, CrossWorkgroup>) "None" attributes {gpu.known_block_size = array<i32: 32, 16, 1>, gpu.known_grid_size = array<i32: 96000, 1, 1>, workgroup_attributions = 0 : i64} {
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi64>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi64>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi64>
%__builtin_var_SubgroupId___addr = spirv.mlir.addressof @__builtin_var_SubgroupId__ : !spirv.ptr<i32, Input>
%2 = spirv.Load "Input" %__builtin_var_SubgroupId___addr : i32
%3 = spirv.UConvert %2 : i32 to i64
%4 = spirv.IMul %1, %arg0 : i64
%5 = spirv.IMul %3, %arg1 : i64
%6 = spirv.IAdd %4, %5 : i64
%cst0_i64 = spirv.Constant 0 : i64
%cst1_i64 = spirv.Constant 1 : i64
%7 = spirv.IMul %cst1_i64, %6 : i64
%8 = spirv.IAdd %cst0_i64, %7 : i64
%9 = spirv.AccessChain %arg2[%8] : !spirv.ptr<!spirv.array<98304000 x f32>, CrossWorkgroup>, i64
%91 = spirv.Bitcast %9 : !spirv.ptr<f32, CrossWorkgroup> to !spirv.ptr<i32, CrossWorkgroup>
%92 = spirv.INTEL.SubgroupBlockRead "CrossWorkgroup" %91 : i32
%10 = spirv.Bitcast %92 : i32 to f32
%11 = spirv.FUnordGreaterThan %10, %arg3 : f32
%12 = spirv.Select %11, %10, %arg3 : i1, f32
%cst0_i64_0 = spirv.Constant 0 : i64
%cst1_i64_1 = spirv.Constant 1 : i64
%13 = spirv.IMul %cst1_i64_1, %6 : i64
%14 = spirv.IAdd %cst0_i64_0, %13 : i64
%15 = spirv.AccessChain %arg4[%14] : !spirv.ptr<!spirv.array<98304000 x f32>, CrossWorkgroup>, i64
%151 = spirv.Bitcast %15 : !spirv.ptr<f32, CrossWorkgroup> to !spirv.ptr<i32, CrossWorkgroup>
%122 = spirv.Bitcast %12 : f32 to i32
spirv.INTEL.SubgroupBlockWrite "CrossWorkgroup" %151, %122 : i32
spirv.Return
}
spirv.EntryPoint "Kernel" @forward_kernel, @__builtin_var_WorkgroupId__, @__builtin_var_SubgroupId__
spirv.ExecutionMode @forward_kernel "SubgroupSize", 16
spirv.ExecutionMode @forward_kernel "ContractionOff"
}
gpu.module @forward_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.1, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
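// gpu dialect form of the same ReLU: linear index = block_id.x * %arg0 + thread_id.y * %arg1,
// scalar load, arith.cmpf ugt + arith.select against 0.0, scalar store.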
gpu.func @forward_kernel(%arg0: index, %arg1: index, %arg2: memref<98304000xf32>, %arg3: f32, %arg4: memref<98304000xf32>) kernel attributes {gpu.known_block_size = array<i32: 32, 32, 1>, gpu.known_grid_size = array<i32: 96000, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%0 = gpu.block_id x
%1 = gpu.thread_id y
%2 = arith.muli %0, %arg0 : index
%3 = arith.muli %1, %arg1 : index
%4 = arith.addi %2, %3 : index
%5 = memref.load %arg2[%4] : memref<98304000xf32>
%6 = arith.cmpf ugt, %5, %arg3 : f32
%7 = arith.select %6, %5, %arg3 : f32
memref.store %7, %arg4[%4] : memref<98304000xf32>
gpu.return
}
}
func.func @main() attributes {llvm.emit_c_interface} {
%0 = memref.get_global @__constant_512x640x20x15xf32 : memref<512x640x20x15xf32>
%1 = call @forward(%0) : (memref<512x640x20x15xf32>) -> memref<512x640x20x15xf32>
return
}
}
112 changes: 112 additions & 0 deletions test/SPIRV/relu.slm.static.8x32.mlir
@@ -0,0 +1,112 @@
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
module @test attributes {gpu.container_module} {
memref.global "private" constant @__constant_8x32xf32 : memref<8x32xf32> = dense<[
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[-1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[-2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1, -2.1]
]>
func.func @test(%arg0: memref<8x32xf32>) -> memref<8x32xf32> attributes {llvm.emit_c_interface} {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%memref = gpu.alloc host_shared () : memref<8x32xf32>
memref.copy %arg0, %memref : memref<8x32xf32> to memref<8x32xf32>
%memref_0 = gpu.alloc host_shared () : memref<8x32xf32>
gpu.launch_func @test_kernel::@test_kernel blocks in (%c8, %c32, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x32xf32>, %cst : f32, %memref_0 : memref<8x32xf32>)
%alloc = memref.alloc() : memref<8x32xf32>
memref.copy %memref_0, %alloc : memref<8x32xf32> to memref<8x32xf32>
gpu.dealloc %memref_0 : memref<8x32xf32>
gpu.dealloc %memref : memref<8x32xf32>
return %alloc : memref<8x32xf32>
}
spirv.module @__spv__test_kernel Physical64 OpenCL requires #spirv.vce<v1.0, [Int64, Kernel, Addresses], []> attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
spirv.GlobalVariable @__workgroup_mem__1 : !spirv.ptr<!spirv.array<512 x f32>, Workgroup>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi64>, Input>
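// One single-work-item workgroup per element of the 8x32 input: index = 32 * workgroup_id.x +
// workgroup_id.y; load, ReLU (FUnordGreaterThan + Select against 0.0), stage the result in
// workgroup (SLM) memory @__workgroup_mem__1, barrier, then copy it to the output buffer.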
spirv.func @test_kernel(%arg0: !spirv.ptr<!spirv.array<256 x f32>, CrossWorkgroup>, %arg1: f32, %arg2: !spirv.ptr<!spirv.array<256 x f32>, CrossWorkgroup>) "None" attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 8, 32, 1>, workgroup_attributions = 0 : i64} {
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi64>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi64>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi64>
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi64>, Input>
%2 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi64>
%3 = spirv.CompositeExtract %2[1 : i32] : vector<3xi64>
%cst0_i64 = spirv.Constant 0 : i64
%cst32_i64 = spirv.Constant 32 : i64
%4 = spirv.IMul %cst32_i64, %1 : i64
%5 = spirv.IAdd %cst0_i64, %4 : i64
%cst1_i64 = spirv.Constant 1 : i64
%6 = spirv.IMul %cst1_i64, %3 : i64
%7 = spirv.IAdd %5, %6 : i64
%8 = spirv.AccessChain %arg0[%7] : !spirv.ptr<!spirv.array<256 x f32>, CrossWorkgroup>, i64
%9 = spirv.Load "CrossWorkgroup" %8 : f32
%10 = spirv.FUnordGreaterThan %9, %arg1 : f32
%11 = spirv.Select %10, %9, %arg1 : i1, f32
%__workgroup_mem__1_addr = spirv.mlir.addressof @__workgroup_mem__1 : !spirv.ptr<!spirv.array<512 x f32>, Workgroup>
%cst0_i64_1 = spirv.Constant 0 : i64
%cst32_i64_2 = spirv.Constant 32 : i64
%12 = spirv.IMul %cst32_i64_2, %1 : i64
%13 = spirv.IAdd %cst0_i64_1, %12 : i64
%cst1_i64_3 = spirv.Constant 1 : i64
%14 = spirv.IMul %cst1_i64_3, %3 : i64
%15 = spirv.IAdd %13, %14 : i64
%16 = spirv.AccessChain %__workgroup_mem__1_addr[%15] : !spirv.ptr<!spirv.array<512 x f32>, Workgroup>, i64
spirv.Store "Workgroup" %16, %11 : f32
spirv.ControlBarrier <Workgroup>, <Workgroup>, <AcquireRelease|WorkgroupMemory>
%cst0_i64_4 = spirv.Constant 0 : i64
%cst32_i64_5 = spirv.Constant 32 : i64
%17 = spirv.IMul %cst32_i64_5, %1 : i64
%18 = spirv.IAdd %cst0_i64_4, %17 : i64
%cst1_i64_6 = spirv.Constant 1 : i64
%19 = spirv.IMul %cst1_i64_6, %3 : i64
%20 = spirv.IAdd %18, %19 : i64
%21 = spirv.AccessChain %__workgroup_mem__1_addr[%20] : !spirv.ptr<!spirv.array<512 x f32>, Workgroup>, i64
%22 = spirv.Load "Workgroup" %21 : f32
%cst0_i64_7 = spirv.Constant 0 : i64
%cst32_i64_8 = spirv.Constant 32 : i64
%23 = spirv.IMul %cst32_i64_8, %1 : i64
%24 = spirv.IAdd %cst0_i64_7, %23 : i64
%cst1_i64_9 = spirv.Constant 1 : i64
%25 = spirv.IMul %cst1_i64_9, %3 : i64
%26 = spirv.IAdd %24, %25 : i64
%27 = spirv.AccessChain %arg2[%26] : !spirv.ptr<!spirv.array<256 x f32>, CrossWorkgroup>, i64
spirv.Store "CrossWorkgroup" %27, %22 : f32
spirv.Return
}
spirv.EntryPoint "Kernel" @test_kernel, @__builtin_var_WorkgroupId__
}
gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
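// gpu dialect form: load %arg0[x, y], ReLU via arith.cmpf ugt + arith.select, stage through a
// Workgroup-storage memref, gpu.barrier, then store to %arg2[x, y].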
gpu.func @test_kernel(%arg0: memref<8x32xf32>, %arg1: f32, %arg2: memref<8x32xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 8, 32, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%0 = gpu.block_id x
%1 = gpu.block_id y
%2 = memref.load %arg0[%0, %1] : memref<8x32xf32>
%3 = arith.cmpf ugt, %2, %arg1 : f32
%4 = arith.select %3, %2, %arg1 : f32
%alloc = memref.alloc() : memref<16x32xf32, #spirv.storage_class<Workgroup>>
memref.store %4, %alloc[%0, %1] : memref<16x32xf32, #spirv.storage_class<Workgroup>>
gpu.barrier
%5 = memref.load %alloc[%0, %1] : memref<16x32xf32, #spirv.storage_class<Workgroup>>
memref.store %5, %arg2[%0, %1] : memref<8x32xf32>
gpu.return
}
}
func.func @main() attributes {llvm.emit_c_interface} {
%0 = memref.get_global @__constant_8x32xf32 : memref<8x32xf32>
%1 = call @test(%0) : (memref<8x32xf32>) -> memref<8x32xf32>
%cast = memref.cast %1 : memref<8x32xf32> to memref<*xf32>
call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
// CHECK: [0, 0
// CHECK: [1.1, 1.1
// CHECK: [0, 0
// CHECK: [2.1, 2.1
return
}
func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
}
