Abhilash Majumder commited on
Commit
9a168fc
·
unverified ·
1 Parent(s): 53d0282

whisper : add SYCL support (#1863)

Browse files

* add changes from llama upstream

* add sycl abstraction

* add sycl build

* update cmake

* add sycl build config

* fix bug

* fix bug

* refactor build

* fix bug

* update build

* call build

* use sycl header

* add examples

* add target

* fix typecast in quant.c

* readd fp16 and readme

* fix quant typecast

* add sample

* add readme

* remove cxx file check

.github/workflows/build.yml CHANGED
@@ -150,6 +150,106 @@ jobs:
150
  make
151
  ctest -L gh --output-on-failure'
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  windows:
154
  runs-on: windows-latest
155
 
 
150
  make
151
  ctest -L gh --output-on-failure'
152
 
153
+ ubuntu-22-cmake-sycl:
154
+ runs-on: ubuntu-22.04
155
+
156
+ strategy:
157
+ fail-fast: false
158
+ matrix:
159
+ dwhisper_sycl: [ON]
160
+ dcmake_c_compiler: [icx]
161
+ dcmake_cxx_compiler: [icpx]
162
+ arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
163
+
164
+ continue-on-error: true
165
+
166
+ steps:
167
+ - name: Clone
168
+ uses: actions/checkout@v3
169
+
170
+ - name: add oneAPI to apt
171
+ shell: bash
172
+ run: |
173
+ cd /tmp
174
+ wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
175
+ sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
176
+ rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
177
+ sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
178
+
179
+ - name: install oneAPI dpcpp compiler
180
+ shell: bash
181
+ run: |
182
+ sudo apt update
183
+ sudo apt install intel-oneapi-compiler-dpcpp-cpp
184
+
185
+ - name: install oneAPI MKL library
186
+ shell: bash
187
+ run: |
188
+ sudo apt install intel-oneapi-mkl-devel
189
+
190
+ - name: Clone
191
+ id: checkout
192
+ uses: actions/checkout@v3
193
+
194
+ - name: Build
195
+ id: cmake_build
196
+ run: |
197
+ source /opt/intel/oneapi/setvars.sh
198
+ mkdir build
199
+ cd build
200
+ cmake -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
201
+ cmake --build . --config Release -j $(nproc)
202
+
203
+ ubuntu-22-cmake-sycl-fp16:
204
+ runs-on: ubuntu-22.04
205
+
206
+ strategy:
207
+ fail-fast: false
208
+ matrix:
209
+ dwhisper_sycl: [ON]
210
+ dcmake_c_compiler: [icx]
211
+ dcmake_cxx_compiler: [icpx]
212
+ arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
213
+
214
+ continue-on-error: true
215
+
216
+ steps:
217
+ - name: Clone
218
+ uses: actions/checkout@v3
219
+
220
+ - name: add oneAPI to apt
221
+ shell: bash
222
+ run: |
223
+ cd /tmp
224
+ wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
225
+ sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
226
+ rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
227
+ sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
228
+
229
+ - name: install oneAPI dpcpp compiler
230
+ shell: bash
231
+ run: |
232
+ sudo apt update
233
+ sudo apt install intel-oneapi-compiler-dpcpp-cpp
234
+
235
+ - name: install oneAPI MKL library
236
+ shell: bash
237
+ run: |
238
+ sudo apt install intel-oneapi-mkl-devel
239
+
240
+ - name: Clone
241
+ id: checkout
242
+ uses: actions/checkout@v3
243
+
244
+ - name: Build
245
+ id: cmake_build
246
+ run: |
247
+ source /opt/intel/oneapi/setvars.sh
248
+ mkdir build
249
+ cd build
250
+ cmake -DWHISPER_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
251
+ cmake --build . --config Release -j $(nproc)
252
+
253
  windows:
254
  runs-on: windows-latest
255
 
CMakeLists.txt CHANGED
@@ -70,12 +70,14 @@ if (APPLE)
70
  option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
71
  option(WHISPER_METAL_EMBED_LIBRARY "whisper: embed Metal library" OFF)
72
  else()
73
- option(WHISPER_BLAS "whisper: use BLAS libraries" OFF)
74
- option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic)
75
- option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF)
76
- option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
77
- option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF)
78
- option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
 
 
79
  endif()
80
 
81
  option(WHISPER_PERF "whisper: enable perf timings" OFF)
@@ -106,6 +108,13 @@ endif()
106
 
107
  find_package(Threads REQUIRED)
108
 
 
 
 
 
 
 
 
109
  # on APPLE
110
  if (APPLE)
111
  # include Accelerate framework
@@ -309,6 +318,30 @@ if( WHISPER_OPENVINO )
309
  find_package(OpenVINO REQUIRED COMPONENTS Runtime)
310
  endif()
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  # compiler flags
313
 
314
  if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@@ -503,6 +536,8 @@ add_library(${TARGET}
503
  ${GGML_SOURCES_METAL}
504
  ${GGML_SOURCES_CUDA}
505
  ${GGML_SOURCES_OPENCL}
 
 
506
  whisper.h
507
  whisper.cpp
508
  )
 
70
  option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
71
  option(WHISPER_METAL_EMBED_LIBRARY "whisper: embed Metal library" OFF)
72
  else()
73
+ option(WHISPER_BLAS "whisper: use BLAS libraries" OFF)
74
+ option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic)
75
+ option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF)
76
+ option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
77
+ option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF)
78
+ option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
79
+ option(WHISPER_SYCL "whisper: use SYCL" OFF)
80
+ option(WHISPER_SYCL_F16 "whisper: use 16 bit floats for sycl calculations" OFF)
81
  endif()
82
 
83
  option(WHISPER_PERF "whisper: enable perf timings" OFF)
 
108
 
109
  find_package(Threads REQUIRED)
110
 
111
+ #compile flag sycl
112
+ if (WHISPER_SYCL)
113
+ set(CMAKE_CXX_STANDARD 17)
114
+ else()
115
+ set(CMAKE_CXX_STANDARD 11)
116
+ endif()
117
+
118
  # on APPLE
119
  if (APPLE)
120
  # include Accelerate framework
 
318
  find_package(OpenVINO REQUIRED COMPONENTS Runtime)
319
  endif()
320
 
321
+ if (WHISPER_SYCL)
322
+ if ( NOT DEFINED ENV{ONEAPI_ROOT})
323
+ message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
324
+ endif()
325
+ #todo: AOT
326
+
327
+ find_package(IntelSYCL REQUIRED)
328
+ if (WHISPER_SYCL_F16)
329
+ add_compile_definitions(GGML_SYCL_F16)
330
+ endif()
331
+ add_compile_definitions(GGML_USE_SYCL)
332
+
333
+ add_compile_options(-I./) #include DPCT
334
+ add_compile_options(-I/${SYCL_INCLUDE_DIR})
335
+
336
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
337
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
338
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
339
+
340
+ set(GGML_HEADERS_SYCL ggml-sycl.h)
341
+ set(GGML_SOURCES_SYCL ggml-sycl.cpp)
342
+
343
+ set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
344
+ endif()
345
  # compiler flags
346
 
347
  if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
 
536
  ${GGML_SOURCES_METAL}
537
  ${GGML_SOURCES_CUDA}
538
  ${GGML_SOURCES_OPENCL}
539
+ ${GGML_SOURCES_SYCL}
540
+ ${GGML_HEADERS_SYCL}
541
  whisper.h
542
  whisper.cpp
543
  )
README_sycl.md ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # whisper.cpp for SYCL
2
+
3
+ [Background](#background)
4
+
5
+ [OS](#os)
6
+
7
+ [Intel GPU](#intel-gpu)
8
+
9
+ [Linux](#linux)
10
+
11
+ [Environment Variable](#environment-variable)
12
+
13
+ [Known Issue](#known-issue)
14
+
15
+ [Todo](#todo)
16
+
17
+ ## Background
18
+
19
+ SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators — such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
20
+
21
+ oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
22
+
23
+ Intel uses SYCL as the direct programming language to support CPUs, GPUs and FPGAs.
24
+
25
+ To avoid re-inventing the wheel, this code refers to other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate to SYCL.
26
+
27
+ The whisper.cpp for SYCL is used to support Intel GPUs.
28
+
29
+ For Intel CPUs, it is recommended to use whisper.cpp for x86 (Intel MKL build).
30
+
31
+ ## OS
32
+
33
+ |OS|Status|Verified|
34
+ |-|-|-|
35
+ |Linux|Support|Ubuntu 22.04|
36
+ |Windows|Ongoing| |
37
+
38
+
39
+ ## Intel GPU
40
+
41
+ |Intel GPU| Status | Verified Model|
42
+ |-|-|-|
43
+ |Intel Data Center Max Series| Support| Max 1550|
44
+ |Intel Data Center Flex Series| Support| Flex 170|
45
+ |Intel Arc Series| Support| Arc 770|
46
+ |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
47
+ |Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
48
+
49
+
50
+ ## Linux
51
+
52
+ ### Setup Environment
53
+
54
+ 1. Install Intel GPU driver.
55
+
56
+ a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
57
+
58
+ Note: for iGPU, please install the client GPU driver.
59
+
60
+ b. Add user to group: video, render.
61
+
62
+ ```
63
+ sudo usermod -aG render username
64
+ sudo usermod -aG video username
65
+ ```
66
+
67
+ Note: re-login to enable it.
68
+
69
+ c. Check
70
+
71
+ ```
72
+ sudo apt install clinfo
73
+ sudo clinfo -l
74
+ ```
75
+
76
+ Output (example):
77
+
78
+ ```
79
+ Platform #0: Intel(R) OpenCL Graphics
80
+ `-- Device #0: Intel(R) Arc(TM) A770 Graphics
81
+
82
+
83
+ Platform #0: Intel(R) OpenCL HD Graphics
84
+ `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
85
+ ```
86
+
87
+ 2. Install Intel® oneAPI Base Toolkit.
88
+
89
+
90
+ a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
91
+
92
+ Recommend to install to default folder: **/opt/intel/oneapi**.
93
+
94
+ Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
95
+
96
+ b. Check
97
+
98
+ ```
99
+ source /opt/intel/oneapi/setvars.sh
100
+
101
+ sycl-ls
102
+ ```
103
+
104
+ There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
105
+
106
+ Output (example):
107
+ ```
108
+ [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
109
+ [opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
110
+ [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
111
+ [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
112
+
113
+ ```
114
+
115
+ 3. Build locally:
116
+
117
+ ```
118
+ mkdir -p build
119
+ cd build
120
+ source /opt/intel/oneapi/setvars.sh
121
+
122
+ #for FP16
123
+ #cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON
124
+
125
+ #for FP32
126
+ cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
127
+
128
+ #build example/main only
129
+ #cmake --build . --config Release --target main
130
+
131
+ #build all binary
132
+ cmake --build . --config Release -v
133
+
134
+ ```
135
+
136
+ or
137
+
138
+ ```
139
+ ./examples/sycl/build.sh
140
+ ```
141
+
142
+ Note:
143
+
144
+ - By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
145
+
146
+ ### Run
147
+
148
+ 1. Put model file to folder **models**
149
+
150
+ 2. Enable oneAPI running environment
151
+
152
+ ```
153
+ source /opt/intel/oneapi/setvars.sh
154
+ ```
155
+
156
+ 3. List device ID
157
+
158
+ Run without parameter:
159
+
160
+ ```
161
+ ./build/bin/ls-sycl-device
162
+
163
+ or
164
+
165
+ ./build/bin/main
166
+ ```
167
+
168
+ Check the ID in startup log, like:
169
+
170
+ ```
171
+ found 4 SYCL devices:
172
+ Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
173
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
174
+ Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
175
+ max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
176
+ Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
177
+ max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
178
+ Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
179
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
180
+
181
+ ```
182
+
183
+ |Attribute|Note|
184
+ |-|-|
185
+ |compute capability 1.3|Level-zero running time, recommended |
186
+ |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
187
+
188
+ 4. Set device ID and execute whisper.cpp
189
+
190
+ Set device ID = 0 by **GGML_SYCL_DEVICE=0**
191
+
192
+ ```
193
+ GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
194
+ ```
195
+ or run by script:
196
+
197
+ ```
198
+ ./examples/sycl/run_whisper.sh
199
+ ```
200
+
201
+
202
+
203
+ 5. Check the device ID in output
204
+
205
+ Like:
206
+ ```
207
+ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
208
+ ```
209
+
210
+
211
+ ## Environment Variable
212
+
213
+ #### Build
214
+
215
+ |Name|Value|Function|
216
+ |-|-|-|
217
+ |WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, WHISPER_SYCL=ON is mandatory.|
218
+ |WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path.For FP32, do not set it.|
219
+ |CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
220
+ |CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
221
+
222
+ #### Running
223
+
224
+
225
+ |Name|Value|Function|
226
+ |-|-|-|
227
+ |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
228
+ |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
229
+
230
+ ## Known Issue
231
+
232
+ - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
233
+
234
+ Miss to enable oneAPI running environment.
235
+
236
+ Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
237
+
238
+
239
+ - Hang during startup
240
+
241
+ whisper.cpp uses mmap as the default way to read the model file and copy it to the GPU. On some systems, the memcpy may be abnormal and block.
242
+
243
+ Solution: add **--no-mmap**.
244
+
245
+ ## Todo
246
+
247
+ - Support to build in Windows.
248
+
249
+ - Support multiple cards.
examples/CMakeLists.txt CHANGED
@@ -79,6 +79,9 @@ else()
79
  add_subdirectory(talk)
80
  add_subdirectory(talk-llama)
81
  add_subdirectory(lsp)
 
 
 
82
  endif()
83
 
84
  add_subdirectory(wchess)
 
79
  add_subdirectory(talk)
80
  add_subdirectory(talk-llama)
81
  add_subdirectory(lsp)
82
+ if (WHISPER_SYCL)
83
+ add_subdirectory(sycl)
84
+ endif()
85
  endif()
86
 
87
  add_subdirectory(wchess)
examples/sycl/CMakeLists.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # MIT license
2
+ # Copyright (C) 2024 Intel Corporation
3
+ # SPDX-License-Identifier: MIT
4
+
5
+ set(TARGET ls-sycl-device)
6
+ add_executable(${TARGET} ls-sycl-device.cpp)
7
+ install(TARGETS ${TARGET} RUNTIME)
8
+ target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
9
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
examples/sycl/README.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # whisper.cpp/examples/sycl
2
+
3
+ This example program provides the tools for whisper.cpp for SYCL on Intel GPUs.
4
+
5
+ ## Tool
6
+
7
+ |Tool Name| Function|Status|
8
+ |-|-|-|
9
+ |ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support|
10
+
11
+ ### ls-sycl-device
12
+
13
+ List all SYCL devices with ID, compute capability, max work group size, etc.
14
+
15
+ 1. Build the llama.cpp for SYCL for all targets.
16
+
17
+ 2. Enable oneAPI running environment
18
+
19
+ ```
20
+ source /opt/intel/oneapi/setvars.sh
21
+ ```
22
+
23
+ 3. Execute
24
+
25
+ ```
26
+ ./build/bin/ls-sycl-device
27
+ ```
28
+
29
+ Check the ID in startup log, like:
30
+
31
+ ```
32
+ found 4 SYCL devices:
33
+ Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
34
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
35
+ Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
36
+ max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
37
+ Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
38
+ max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
39
+ Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
40
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
41
+
42
+ ```
43
+
44
+ |Attribute|Note|
45
+ |-|-|
46
+ |compute capability 1.3|Level-zero running time, recommended |
47
+ |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
examples/sycl/build.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT license
2
+ # Copyright (C) 2024 Intel Corporation
3
+ # SPDX-License-Identifier: MIT
4
+
5
+ mkdir -p build
6
+ cd build
7
+ source /opt/intel/oneapi/setvars.sh
8
+
9
+ #for FP16
10
+ #cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference
11
+
12
+ #for FP32
13
+ cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
14
+
15
+ #build example/main only
16
+ #cmake --build . --config Release --target main
17
+
18
+ #build all binary
19
+ cmake --build . --config Release -v
examples/sycl/ls-sycl-device.cpp ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*MIT license
2
+ Copyright (C) 2024 Intel Corporation
3
+ SPDX-License-Identifier: MIT
4
+ */
5
+
6
+ #include "ggml-sycl.h"
7
+
8
+ int main(int argc, char ** argv) {
9
+ ggml_backend_sycl_print_sycl_devices();
10
+ return 0;
11
+ }
examples/sycl/run-whisper.sh ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # MIT license
4
+ # Copyright (C) 2024 Intel Corporation
5
+ # SPDX-License-Identifier: MIT
6
+
7
+ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
8
+ source /opt/intel/oneapi/setvars.sh
9
+
10
+ if [ $# -gt 0 ]; then
11
+ export GGML_SYCL_DEVICE=$1
12
+ else
13
+ export GGML_SYCL_DEVICE=0
14
+ fi
15
+ echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
16
+ #export GGML_SYCL_DEBUG=1
17
+ ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
whisper.cpp CHANGED
@@ -12,6 +12,10 @@
12
  #include "ggml-cuda.h"
13
  #endif
14
 
 
 
 
 
15
  #ifdef WHISPER_USE_OPENVINO
16
  #include "openvino/whisper-openvino-encoder.h"
17
  #endif
@@ -1052,6 +1056,16 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params
1052
  }
1053
  #endif
1054
 
 
 
 
 
 
 
 
 
 
 
1055
  if (backend_gpu) {
1056
  return backend_gpu;
1057
  }
 
12
  #include "ggml-cuda.h"
13
  #endif
14
 
15
+ #ifdef GGML_USE_SYCL
16
+ #include "ggml-sycl.h"
17
+ #endif
18
+
19
  #ifdef WHISPER_USE_OPENVINO
20
  #include "openvino/whisper-openvino-encoder.h"
21
  #endif
 
1056
  }
1057
  #endif
1058
 
1059
+ #ifdef GGML_USE_SYCL
1060
+ if (params.use_gpu) {
1061
+ WHISPER_LOG_INFO("%s: using SYCL backend\n", __func__);
1062
+ backend_gpu = ggml_backend_sycl_init(params.gpu_device);
1063
+ if (!backend_gpu) {
1064
+ WHISPER_LOG_ERROR("%s: ggml_backend_sycl_init() failed\n", __func__);
1065
+ }
1066
+ }
1067
+ #endif
1068
+
1069
  if (backend_gpu) {
1070
  return backend_gpu;
1071
  }