Skip to content

Triton benchmarks

Triton benchmarks #494

name: Triton benchmarks
on:
workflow_dispatch:
inputs:
runner_label:
description: Runner label, keep empty for default
type: string
default: ""
tag:
description: Tag for benchmark results
type: string
default: "test"
install_ipex:
description: Install Intel PyTorch Extension
type: boolean
default: true
schedule:
- cron: "5 23 * * *"
permissions: read-all
env:
PYTHON_VERSION: "3.10"
USE_IPEX: ${{ github.event_name == 'schedule' && '1' || inputs.install_ipex && '1' || '0' }}
jobs:
build:
name: Triton benchmarks
runs-on:
- ${{ inputs.runner_label || 'max1550' }}
timeout-minutes: 720
defaults:
run:
shell: bash -noprofile --norc -eo pipefail -c "source /home/runner/intel/oneapi/setvars.sh > /dev/null; source {0}"
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Load pip cache
id: pip-cache
uses: ./.github/actions/load
with:
path: $HOME/.cache/pip
# pip cache per commit id just to minimize network traffic
key: pip-$PYTHON_VERSION-$GITHUB_SHA
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Python build dependencies
run: |
pip install wheel cmake
- name: Setup PyTorch with IPEX
if: ${{ github.event_name == 'schedule' || inputs.install_ipex }}
uses: ./.github/actions/setup-pytorch
with:
repository: Stonepia/pytorch
- name: Setup PyTorch without IPEX
if: ${{ !(github.event_name == 'schedule' || inputs.install_ipex) }}
uses: ./.github/actions/setup-pytorch
with:
repository: pytorch/pytorch
- name: Setup IPEX
if: ${{ github.event_name == 'schedule' || inputs.install_ipex }}
uses: ./.github/actions/setup-ipex
- name: Build Triton wheels
uses: ./.github/actions/setup-triton
with:
command: DEBUG=1 python setup.py bdist_wheel
- name: Install Triton
run: |
pip install python/dist/*.whl
- name: Install benchmark dependencies
run: |
pip install matplotlib pandas tabulate
- name: Create reports dir
run: |
mkdir reports
echo "REPORTS=$PWD/reports" >> $GITHUB_ENV
- name: Install benchmarks
id: install
run: |
cd benchmarks
python setup.py install
- name: Run Triton Softmax kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python fused_softmax.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
TAG=${{ inputs.tag || 'ci' }}
python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
source ../../scripts/capture-hw-details.sh
TAG=${{ inputs.tag || 'ci' }}
python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark - default path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
# Default path:
TRITON_INTEL_ADVANCED_PATH=0 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR -nolocalra" \
IGC_DisableLoopUnroll=1 \
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv
TAG=${{ inputs.tag || 'ci' }}-dflt
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark - advanced path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
# Advanced path:
TRITON_INTEL_ADVANCED_PATH=1 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR -nolocalra" \
IGC_DisableLoopUnroll=1 \
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv
TAG=${{ inputs.tag || 'ci' }}-adv
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM + PreOp (exp) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_preop_exp_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
TAG=${{ inputs.tag || 'ci' }}
python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_postop_gelu_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
TAG=${{ inputs.tag || 'ci' }}
python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_postop_addmatrix_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
TAG=${{ inputs.tag || 'ci' }}
python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python flash_attention_fwd_benchmark.py --reports $REPORTS
TAG=${{ inputs.tag || 'ci' }}
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark - default path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
TRITON_INTEL_ADVANCED_PATH=0 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR -nolocalra -printregusage -DPASTokenReduction -enableHalfLSC -abiver 2" \
IGC_DisableLoopUnroll=1 \
python flash_attention_fwd_benchmark.py --reports $REPORTS
TAG=${{ inputs.tag || 'ci' }}-dflt
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark - advanced path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
TRITON_INTEL_ADVANCED_PATH=1 \
TRITON_INTEL_ENABLE_INSTR_SCHED=1 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR -nolocalra -printregusage -DPASTokenReduction -enableHalfLSC -abiver 2" \
IGC_DisableLoopUnroll=1 \
python flash_attention_fwd_benchmark.py --reports $REPORTS
TAG=${{ inputs.tag || 'ci' }}-adv
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Prefix Sums kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
python prefix_sums.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
TAG=${{ inputs.tag || 'ci' }}
python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run micro benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/micro_benchmarks
python run_benchmarks.py --reports $REPORTS
- name: Save pip cache
if: ${{ steps.pip-cache.outputs.status == 'miss' }}
uses: ./.github/actions/save
with:
path: ${{ steps.pip-cache.outputs.path }}
dest: ${{ steps.pip-cache.outputs.dest }}
- name: Upload benchmark reports
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
uses: actions/upload-artifact@v4
with:
name: benchmark-reports
path: reports