Spaces:
Running
Running
Diego Devesa
commited on
Commit
·
1794b43
1
Parent(s):
b3e6ea8
ggml : add predefined list of CPU backend variants to build (llama/10626)
Browse files* ggml : add predefined list of CPU backend variants to build
* update CPU dockerfiles
- ggml/CMakeLists.txt +24 -25
- ggml/src/CMakeLists.txt +35 -0
- ggml/src/ggml-backend-reg.cpp +21 -11
- ggml/src/ggml-cpu/CMakeLists.txt +299 -264
- ggml/src/ggml-cpu/cpu-feats-x86.cpp +54 -29
- ggml/src/ggml-cpu/ggml-cpu.c +1 -1
- ggml/src/ggml-cpu/ggml-cpu.cpp +9 -1
ggml/CMakeLists.txt
CHANGED
|
@@ -92,30 +92,33 @@ else()
|
|
| 92 |
set(INS_ENB ON)
|
| 93 |
endif()
|
| 94 |
|
| 95 |
-
option(GGML_CPU_HBM
|
| 96 |
-
option(GGML_CPU_AARCH64
|
| 97 |
-
|
| 98 |
-
option(
|
| 99 |
-
option(
|
| 100 |
-
option(
|
| 101 |
-
option(
|
| 102 |
-
option(
|
| 103 |
-
option(
|
| 104 |
-
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
|
| 105 |
-
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
|
| 106 |
-
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
|
| 107 |
-
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
|
| 108 |
-
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
|
| 109 |
if (NOT MSVC)
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
endif()
|
| 112 |
-
option(GGML_LASX
|
| 113 |
-
option(GGML_LSX
|
| 114 |
-
option(GGML_RVV
|
| 115 |
-
option(GGML_SVE
|
|
|
|
|
|
|
| 116 |
|
| 117 |
if (WIN32)
|
| 118 |
-
set(GGML_WIN_VER "0x602" CACHE STRING
|
| 119 |
endif()
|
| 120 |
|
| 121 |
# ggml core
|
|
@@ -180,11 +183,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
|
|
| 180 |
set(CMAKE_C_STANDARD 11)
|
| 181 |
set(CMAKE_C_STANDARD_REQUIRED true)
|
| 182 |
|
| 183 |
-
|
| 184 |
-
set(CMAKE_CXX_STANDARD 17)
|
| 185 |
-
else()
|
| 186 |
-
set(CMAKE_CXX_STANDARD 11)
|
| 187 |
-
endif()
|
| 188 |
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
| 189 |
|
| 190 |
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
|
|
|
| 92 |
set(INS_ENB ON)
|
| 93 |
endif()
|
| 94 |
|
| 95 |
+
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
| 96 |
+
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
| 97 |
+
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
| 98 |
+
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
|
| 99 |
+
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
| 100 |
+
option(GGML_AVX512 "ggml: enable AVX512F" OFF)
|
| 101 |
+
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
|
| 102 |
+
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
|
| 103 |
+
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
if (NOT MSVC)
|
| 105 |
+
# in MSVC F16C and FMA is implied with AVX2/AVX512
|
| 106 |
+
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
|
| 107 |
+
option(GGML_F16C "ggml: enable F16C" ${INS_ENB})
|
| 108 |
+
# MSVC does not seem to support AMX
|
| 109 |
+
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
|
| 110 |
+
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
|
| 111 |
+
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
|
| 112 |
endif()
|
| 113 |
+
option(GGML_LASX "ggml: enable lasx" ON)
|
| 114 |
+
option(GGML_LSX "ggml: enable lsx" ON)
|
| 115 |
+
option(GGML_RVV "ggml: enable rvv" ON)
|
| 116 |
+
option(GGML_SVE "ggml: enable SVE" OFF)
|
| 117 |
+
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
| 118 |
+
|
| 119 |
|
| 120 |
if (WIN32)
|
| 121 |
+
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
|
| 122 |
endif()
|
| 123 |
|
| 124 |
# ggml core
|
|
|
|
| 183 |
set(CMAKE_C_STANDARD 11)
|
| 184 |
set(CMAKE_C_STANDARD_REQUIRED true)
|
| 185 |
|
| 186 |
+
set(CMAKE_CXX_STANDARD 17)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
| 188 |
|
| 189 |
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
ggml/src/CMakeLists.txt
CHANGED
|
@@ -269,7 +269,42 @@ function(ggml_add_backend backend)
|
|
| 269 |
endif()
|
| 270 |
endfunction()
|
| 271 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
ggml_add_backend(CPU)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
ggml_add_backend(BLAS)
|
| 274 |
ggml_add_backend(CANN)
|
| 275 |
ggml_add_backend(CUDA)
|
|
|
|
| 269 |
endif()
|
| 270 |
endfunction()
|
| 271 |
|
| 272 |
+
function(ggml_add_cpu_backend_variant tag_name)
|
| 273 |
+
set(GGML_CPU_TAG_NAME ${tag_name})
|
| 274 |
+
# other: OPENMP LLAMAFILE CPU_HBM
|
| 275 |
+
foreach (feat NATIVE
|
| 276 |
+
AVX AVX2 AVX_VNNI FMA F16C
|
| 277 |
+
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
| 278 |
+
AMX_TILE AMX_INT8 AMX_BF16)
|
| 279 |
+
set(GGML_${feat} OFF)
|
| 280 |
+
endforeach()
|
| 281 |
+
|
| 282 |
+
foreach (feat ${ARGN})
|
| 283 |
+
set(GGML_${feat} ON)
|
| 284 |
+
endforeach()
|
| 285 |
+
|
| 286 |
+
ggml_add_cpu_backend_variant_impl(${tag_name})
|
| 287 |
+
endfunction()
|
| 288 |
+
|
| 289 |
ggml_add_backend(CPU)
|
| 290 |
+
|
| 291 |
+
if (GGML_CPU_ALL_VARIANTS)
|
| 292 |
+
if (NOT GGML_BACKEND_DL)
|
| 293 |
+
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
| 294 |
+
endif()
|
| 295 |
+
ggml_add_cpu_backend_variant(sandybridge AVX)
|
| 296 |
+
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
|
| 297 |
+
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
|
| 298 |
+
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
| 299 |
+
if (NOT MSVC)
|
| 300 |
+
# MSVC doesn't support AVX-VNNI or AMX
|
| 301 |
+
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
|
| 302 |
+
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
| 303 |
+
endif()
|
| 304 |
+
else ()
|
| 305 |
+
ggml_add_cpu_backend_variant_impl("")
|
| 306 |
+
endif()
|
| 307 |
+
|
| 308 |
ggml_add_backend(BLAS)
|
| 309 |
ggml_add_backend(CANN)
|
| 310 |
ggml_add_backend(CUDA)
|
ggml/src/ggml-backend-reg.cpp
CHANGED
|
@@ -483,6 +483,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
|
|
| 483 |
best_score = s;
|
| 484 |
best_path = entry.path().string();
|
| 485 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
}
|
| 487 |
}
|
| 488 |
}
|
|
@@ -505,15 +509,21 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
|
|
| 505 |
}
|
| 506 |
|
| 507 |
void ggml_backend_load_all() {
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
ggml_backend_load_best("
|
| 515 |
-
ggml_backend_load_best("
|
| 516 |
-
ggml_backend_load_best("
|
| 517 |
-
ggml_backend_load_best("
|
| 518 |
-
ggml_backend_load_best("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
}
|
|
|
|
| 483 |
best_score = s;
|
| 484 |
best_path = entry.path().string();
|
| 485 |
}
|
| 486 |
+
} else {
|
| 487 |
+
if (!silent) {
|
| 488 |
+
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
|
| 489 |
+
}
|
| 490 |
}
|
| 491 |
}
|
| 492 |
}
|
|
|
|
| 509 |
}
|
| 510 |
|
| 511 |
void ggml_backend_load_all() {
|
| 512 |
+
#ifdef NDEBUG
|
| 513 |
+
bool silent = true;
|
| 514 |
+
#else
|
| 515 |
+
bool silent = false;
|
| 516 |
+
#endif
|
| 517 |
+
|
| 518 |
+
ggml_backend_load_best("blas", silent);
|
| 519 |
+
ggml_backend_load_best("cann", silent);
|
| 520 |
+
ggml_backend_load_best("cuda", silent);
|
| 521 |
+
ggml_backend_load_best("hip", silent);
|
| 522 |
+
ggml_backend_load_best("kompute", silent);
|
| 523 |
+
ggml_backend_load_best("metal", silent);
|
| 524 |
+
ggml_backend_load_best("rpc", silent);
|
| 525 |
+
ggml_backend_load_best("sycl", silent);
|
| 526 |
+
ggml_backend_load_best("vulkan", silent);
|
| 527 |
+
ggml_backend_load_best("musa", silent);
|
| 528 |
+
ggml_backend_load_best("cpu", silent);
|
| 529 |
}
|
ggml/src/ggml-cpu/CMakeLists.txt
CHANGED
|
@@ -1,319 +1,354 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
ggml-cpu.c
|
| 5 |
-
ggml-cpu.cpp
|
| 6 |
-
ggml-cpu-aarch64.c
|
| 7 |
-
ggml-cpu-aarch64.h
|
| 8 |
-
ggml-cpu-quants.c
|
| 9 |
-
ggml-cpu-quants.h
|
| 10 |
-
amx/amx.cpp
|
| 11 |
-
amx/amx.h
|
| 12 |
-
amx/mmq.cpp
|
| 13 |
-
amx/mmq.h
|
| 14 |
-
ggml-cpu-impl.h
|
| 15 |
-
)
|
| 16 |
-
|
| 17 |
-
target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
|
| 18 |
-
target_include_directories(ggml-cpu PRIVATE .)
|
| 19 |
-
|
| 20 |
-
if (APPLE AND GGML_ACCELERATE)
|
| 21 |
-
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
| 22 |
-
if (ACCELERATE_FRAMEWORK)
|
| 23 |
-
message(STATUS "Accelerate framework found")
|
| 24 |
-
|
| 25 |
-
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
|
| 26 |
-
target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
|
| 27 |
-
target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)
|
| 28 |
-
|
| 29 |
-
target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
|
| 30 |
else()
|
| 31 |
-
|
| 32 |
endif()
|
| 33 |
-
endif()
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
| 45 |
endif()
|
| 46 |
-
endif()
|
| 47 |
-
|
| 48 |
-
if (GGML_LLAMAFILE)
|
| 49 |
-
message(STATUS "Using llamafile")
|
| 50 |
|
| 51 |
-
|
|
|
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
endif()
|
| 57 |
|
| 58 |
-
if (GGML_CPU_HBM)
|
| 59 |
-
|
| 60 |
|
| 61 |
-
|
| 62 |
|
| 63 |
-
|
| 64 |
|
| 65 |
-
|
| 66 |
-
endif()
|
| 67 |
|
| 68 |
-
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
|
| 74 |
-
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
|
| 91 |
-
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
|
| 103 |
-
|
| 104 |
-
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
|
| 128 |
-
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
|
| 138 |
-
|
| 139 |
-
|
| 140 |
|
| 141 |
-
|
| 142 |
|
| 143 |
-
|
|
|
|
| 144 |
endif ()
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
|
| 150 |
-
endif()
|
| 151 |
-
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
|
| 152 |
-
# Raspberry Pi 1, Zero
|
| 153 |
-
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
|
| 154 |
-
endif()
|
| 155 |
-
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
| 156 |
-
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
|
| 157 |
-
# Android armeabi-v7a
|
| 158 |
-
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
|
| 159 |
-
else()
|
| 160 |
-
# Raspberry Pi 2
|
| 161 |
-
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
| 162 |
endif()
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
| 167 |
-
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
| 168 |
-
endif()
|
| 169 |
-
if (GGML_SVE)
|
| 170 |
-
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
|
| 171 |
-
endif()
|
| 172 |
-
endif()
|
| 173 |
-
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
| 174 |
-
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
| 175 |
-
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
|
| 176 |
-
message(STATUS "x86 detected")
|
| 177 |
-
if (MSVC)
|
| 178 |
-
# instruction set detection for MSVC only
|
| 179 |
-
if (GGML_NATIVE)
|
| 180 |
-
include(cmake/FindSIMD.cmake)
|
| 181 |
-
endif ()
|
| 182 |
-
if (GGML_AVX512)
|
| 183 |
-
list(APPEND ARCH_FLAGS /arch:AVX512)
|
| 184 |
-
# MSVC has no compile-time flags enabling specific
|
| 185 |
-
# AVX512 extensions, neither it defines the
|
| 186 |
-
# macros corresponding to the extensions.
|
| 187 |
-
# Do it manually.
|
| 188 |
-
if (GGML_AVX512_VBMI)
|
| 189 |
-
list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
|
| 190 |
-
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
| 191 |
-
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
| 192 |
-
endif()
|
| 193 |
endif()
|
| 194 |
-
if (
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
list(APPEND ARCH_FLAGS -
|
|
|
|
|
|
|
|
|
|
| 198 |
endif()
|
| 199 |
endif()
|
| 200 |
-
if (
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
endif()
|
| 205 |
endif()
|
| 206 |
-
if (
|
| 207 |
-
list(APPEND
|
| 208 |
endif()
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
endif()
|
| 212 |
-
if (
|
| 213 |
-
|
|
|
|
| 214 |
endif()
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
endif()
|
| 225 |
endif()
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
if (
|
| 231 |
-
|
| 232 |
-
endif()
|
| 233 |
-
if (GGML_FMA)
|
| 234 |
-
list(APPEND ARCH_FLAGS -mfma)
|
| 235 |
-
endif()
|
| 236 |
-
if (GGML_AVX)
|
| 237 |
-
list(APPEND ARCH_FLAGS -mavx)
|
| 238 |
-
endif()
|
| 239 |
-
if (GGML_AVX2)
|
| 240 |
-
list(APPEND ARCH_FLAGS -mavx2)
|
| 241 |
-
endif()
|
| 242 |
-
if (GGML_AVX_VNNI)
|
| 243 |
-
list(APPEND ARCH_FLAGS -mavxvnni)
|
| 244 |
-
endif()
|
| 245 |
-
if (GGML_AVX512)
|
| 246 |
-
list(APPEND ARCH_FLAGS -mavx512f)
|
| 247 |
-
list(APPEND ARCH_FLAGS -mavx512dq)
|
| 248 |
-
list(APPEND ARCH_FLAGS -mavx512bw)
|
| 249 |
-
endif()
|
| 250 |
-
if (GGML_AVX512_VBMI)
|
| 251 |
-
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
| 252 |
endif()
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
|
|
|
|
|
|
|
|
|
| 258 |
endif()
|
| 259 |
-
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
endif()
|
| 262 |
-
if (
|
| 263 |
-
list(APPEND ARCH_FLAGS -
|
| 264 |
endif()
|
| 265 |
-
|
| 266 |
-
|
|
|
|
|
|
|
| 267 |
endif()
|
| 268 |
-
endif()
|
| 269 |
-
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
| 270 |
-
message(STATUS "PowerPC detected")
|
| 271 |
-
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
| 272 |
-
string(FIND "${POWER10_M}" "POWER10" substring_index)
|
| 273 |
-
if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
|
| 274 |
-
set(substring_index -1)
|
| 275 |
-
endif()
|
| 276 |
-
|
| 277 |
-
if (${substring_index} GREATER_EQUAL 0)
|
| 278 |
-
list(APPEND ARCH_FLAGS -mcpu=power10)
|
| 279 |
-
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
| 280 |
-
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
| 281 |
else()
|
| 282 |
-
|
| 283 |
-
# TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
| 284 |
endif()
|
| 285 |
-
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
| 286 |
-
message(STATUS "loongarch64 detected")
|
| 287 |
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
list(APPEND ARCH_FLAGS -mlasx)
|
| 291 |
endif()
|
| 292 |
-
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
endif()
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
|
| 299 |
endif()
|
| 300 |
-
|
| 301 |
-
message(STATUS "Unknown architecture")
|
| 302 |
-
endif()
|
| 303 |
-
|
| 304 |
-
if (GGML_CPU_AARCH64)
|
| 305 |
-
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
|
| 306 |
-
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
|
| 307 |
-
endif()
|
| 308 |
-
|
| 309 |
-
target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
|
| 310 |
-
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}")
|
| 311 |
-
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
|
| 312 |
-
|
| 313 |
-
# the feature detection code must be compiled without any architecture flags
|
| 314 |
-
target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
|
| 315 |
-
# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection
|
| 316 |
-
|
| 317 |
-
if (EMSCRIPTEN)
|
| 318 |
-
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
|
| 319 |
-
endif()
|
|
|
|
| 1 |
+
function(ggml_add_cpu_backend_variant_impl tag_name)
|
| 2 |
+
if (tag_name)
|
| 3 |
+
set(GGML_CPU_NAME ggml-cpu-${tag_name})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
else()
|
| 5 |
+
set(GGML_CPU_NAME ggml-cpu)
|
| 6 |
endif()
|
|
|
|
| 7 |
|
| 8 |
+
ggml_add_backend_library(${GGML_CPU_NAME})
|
| 9 |
+
|
| 10 |
+
list (APPEND GGML_CPU_SOURCES
|
| 11 |
+
ggml-cpu/ggml-cpu.c
|
| 12 |
+
ggml-cpu/ggml-cpu.cpp
|
| 13 |
+
ggml-cpu/ggml-cpu-aarch64.c
|
| 14 |
+
ggml-cpu/ggml-cpu-aarch64.h
|
| 15 |
+
ggml-cpu/ggml-cpu-quants.c
|
| 16 |
+
ggml-cpu/ggml-cpu-quants.h
|
| 17 |
+
ggml-cpu/amx/amx.cpp
|
| 18 |
+
ggml-cpu/amx/amx.h
|
| 19 |
+
ggml-cpu/amx/mmq.cpp
|
| 20 |
+
ggml-cpu/amx/mmq.h
|
| 21 |
+
ggml-cpu/ggml-cpu-impl.h
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
|
| 25 |
+
target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)
|
| 26 |
+
|
| 27 |
+
if (APPLE AND GGML_ACCELERATE)
|
| 28 |
+
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
| 29 |
+
if (ACCELERATE_FRAMEWORK)
|
| 30 |
+
message(STATUS "Accelerate framework found")
|
| 31 |
+
|
| 32 |
+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE)
|
| 33 |
+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK)
|
| 34 |
+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64)
|
| 35 |
+
|
| 36 |
+
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK})
|
| 37 |
+
else()
|
| 38 |
+
message(WARNING "Accelerate framework not found")
|
| 39 |
+
endif()
|
| 40 |
+
endif()
|
| 41 |
|
| 42 |
+
if (GGML_OPENMP)
|
| 43 |
+
find_package(OpenMP)
|
| 44 |
+
if (OpenMP_FOUND)
|
| 45 |
+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
|
| 46 |
|
| 47 |
+
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
| 48 |
+
else()
|
| 49 |
+
message(WARNING "OpenMP not found")
|
| 50 |
+
endif()
|
| 51 |
endif()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
+
if (GGML_LLAMAFILE)
|
| 54 |
+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
|
| 55 |
|
| 56 |
+
list(APPEND GGML_CPU_SOURCES
|
| 57 |
+
ggml-cpu/llamafile/sgemm.cpp
|
| 58 |
+
ggml-cpu/llamafile/sgemm.h)
|
| 59 |
+
endif()
|
| 60 |
|
| 61 |
+
if (GGML_CPU_HBM)
|
| 62 |
+
find_library(memkind memkind REQUIRED)
|
| 63 |
|
| 64 |
+
message(STATUS "Using memkind for CPU HBM")
|
| 65 |
|
| 66 |
+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM)
|
| 67 |
|
| 68 |
+
target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
|
| 69 |
+
endif()
|
| 70 |
|
| 71 |
+
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
|
| 72 |
+
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
|
| 73 |
+
(NOT CMAKE_OSX_ARCHITECTURES AND
|
| 74 |
+
NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
| 75 |
+
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
|
| 76 |
|
| 77 |
+
message(STATUS "ARM detected")
|
| 78 |
|
| 79 |
+
if (MSVC)
|
| 80 |
+
list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
|
| 81 |
+
list(APPEND ARCH_DEFINITIONS __ARM_NEON)
|
| 82 |
+
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
|
| 83 |
|
| 84 |
+
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
|
| 85 |
+
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
|
| 86 |
|
| 87 |
+
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
|
| 88 |
+
if (GGML_COMPILER_SUPPORT_DOTPROD)
|
| 89 |
+
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
|
| 90 |
|
| 91 |
+
message(STATUS "ARM feature DOTPROD enabled")
|
| 92 |
+
endif ()
|
| 93 |
|
| 94 |
+
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
| 95 |
|
| 96 |
+
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
| 97 |
+
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
|
| 98 |
|
| 99 |
+
message(STATUS "ARM feature MATMUL_INT8 enabled")
|
| 100 |
+
endif ()
|
| 101 |
|
| 102 |
+
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
|
| 103 |
+
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
|
| 104 |
+
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
| 105 |
|
| 106 |
+
message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
|
| 107 |
+
endif ()
|
| 108 |
|
| 109 |
+
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
|
| 110 |
+
elseif (APPLE)
|
| 111 |
+
if (GGML_NATIVE)
|
| 112 |
+
set(USER_PROVIDED_MARCH FALSE)
|
| 113 |
+
foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
|
| 114 |
+
if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
|
| 115 |
+
set(USER_PROVIDED_MARCH TRUE)
|
| 116 |
+
break()
|
| 117 |
+
endif()
|
| 118 |
+
endforeach()
|
| 119 |
|
| 120 |
+
if (NOT USER_PROVIDED_MARCH)
|
| 121 |
+
set(MARCH_FLAGS "-march=armv8.2a")
|
| 122 |
|
| 123 |
+
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
|
| 124 |
+
if (GGML_COMPILER_SUPPORT_DOTPROD)
|
| 125 |
+
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
|
| 126 |
+
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
|
| 127 |
|
| 128 |
+
message(STATUS "ARM feature DOTPROD enabled")
|
| 129 |
+
endif ()
|
| 130 |
|
| 131 |
+
set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
|
| 132 |
|
| 133 |
+
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
| 134 |
+
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
|
| 135 |
|
| 136 |
+
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
| 137 |
+
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
| 138 |
+
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
|
| 139 |
+
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
|
| 140 |
|
| 141 |
+
message(STATUS "ARM feature MATMUL_INT8 enabled")
|
| 142 |
+
endif ()
|
| 143 |
|
| 144 |
+
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
| 145 |
|
| 146 |
+
list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
|
| 147 |
+
endif ()
|
| 148 |
endif ()
|
| 149 |
+
else()
|
| 150 |
+
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
| 151 |
+
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
| 152 |
+
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
endif()
|
| 154 |
+
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
|
| 155 |
+
# Raspberry Pi 1, Zero
|
| 156 |
+
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
endif()
|
| 158 |
+
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
| 159 |
+
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
|
| 160 |
+
# Android armeabi-v7a
|
| 161 |
+
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
|
| 162 |
+
else()
|
| 163 |
+
# Raspberry Pi 2
|
| 164 |
+
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
| 165 |
endif()
|
| 166 |
endif()
|
| 167 |
+
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
| 168 |
+
# Android arm64-v8a
|
| 169 |
+
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
| 170 |
+
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
|
|
|
| 171 |
endif()
|
| 172 |
+
if (GGML_SVE)
|
| 173 |
+
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
|
| 174 |
endif()
|
| 175 |
+
endif()
|
| 176 |
+
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
| 177 |
+
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
| 178 |
+
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
|
| 179 |
+
if (MSVC)
|
| 180 |
+
# instruction set detection for MSVC only
|
| 181 |
+
if (GGML_NATIVE)
|
| 182 |
+
include(ggml-cpu/cmake/FindSIMD.cmake)
|
| 183 |
+
endif ()
|
| 184 |
+
if (GGML_AVX512)
|
| 185 |
+
list(APPEND ARCH_FLAGS /arch:AVX512)
|
| 186 |
+
# /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
|
| 187 |
+
# MSVC has no compile-time flags enabling specific
|
| 188 |
+
# AVX512 extensions, neither it defines the
|
| 189 |
+
# macros corresponding to the extensions.
|
| 190 |
+
# Do it manually.
|
| 191 |
+
list(APPEND ARCH_DEFINITIONS GGML_AVX512)
|
| 192 |
+
if (GGML_AVX512_VBMI)
|
| 193 |
+
list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
|
| 194 |
+
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
| 195 |
+
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
| 196 |
+
endif()
|
| 197 |
+
endif()
|
| 198 |
+
if (GGML_AVX512_VNNI)
|
| 199 |
+
list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI)
|
| 200 |
+
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
| 201 |
+
list(APPEND ARCH_FLAGS -mavx512vnni)
|
| 202 |
+
endif()
|
| 203 |
+
endif()
|
| 204 |
+
if (GGML_AVX512_BF16)
|
| 205 |
+
list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16)
|
| 206 |
+
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
| 207 |
+
list(APPEND ARCH_FLAGS -mavx512bf16)
|
| 208 |
+
endif()
|
| 209 |
+
endif()
|
| 210 |
+
if (GGML_AMX_TILE)
|
| 211 |
+
list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE)
|
| 212 |
+
endif()
|
| 213 |
+
if (GGML_AMX_INT8)
|
| 214 |
+
list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8)
|
| 215 |
+
endif()
|
| 216 |
+
if (GGML_AMX_BF16)
|
| 217 |
+
list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16)
|
| 218 |
+
endif()
|
| 219 |
+
elseif (GGML_AVX2)
|
| 220 |
+
list(APPEND ARCH_FLAGS /arch:AVX2)
|
| 221 |
+
list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C)
|
| 222 |
+
elseif (GGML_AVX)
|
| 223 |
+
list(APPEND ARCH_FLAGS /arch:AVX)
|
| 224 |
+
list(APPEND ARCH_DEFINITIONS GGML_AVX)
|
| 225 |
+
else ()
|
| 226 |
+
list(APPEND ARCH_FLAGS /arch:SSE4.2)
|
| 227 |
+
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
| 228 |
endif()
|
| 229 |
+
if (GGML_AVX_VNNI)
|
| 230 |
+
# MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2
|
| 231 |
+
#list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
|
| 232 |
endif()
|
| 233 |
+
else ()
|
| 234 |
+
if (GGML_NATIVE)
|
| 235 |
+
list(APPEND ARCH_FLAGS -march=native)
|
| 236 |
+
else ()
|
| 237 |
+
list(APPEND ARCH_FLAGS -msse4.2)
|
| 238 |
+
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
| 239 |
+
if (GGML_F16C)
|
| 240 |
+
list(APPEND ARCH_FLAGS -mf16c)
|
| 241 |
+
list(APPEND ARCH_DEFINITIONS GGML_F16C)
|
| 242 |
+
endif()
|
| 243 |
+
if (GGML_FMA)
|
| 244 |
+
list(APPEND ARCH_FLAGS -mfma)
|
| 245 |
+
list(APPEND ARCH_DEFINITIONS GGML_FMA)
|
| 246 |
+
endif()
|
| 247 |
+
if (GGML_AVX)
|
| 248 |
+
list(APPEND ARCH_FLAGS -mavx)
|
| 249 |
+
list(APPEND ARCH_DEFINITIONS GGML_AVX)
|
| 250 |
+
endif()
|
| 251 |
+
if (GGML_AVX2)
|
| 252 |
+
list(APPEND ARCH_FLAGS -mavx2)
|
| 253 |
+
list(APPEND ARCH_DEFINITIONS GGML_AVX2)
|
| 254 |
+
endif()
|
| 255 |
+
if (GGML_AVX_VNNI)
|
| 256 |
+
list(APPEND ARCH_FLAGS -mavxvnni)
|
| 257 |
+
list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI)
|
| 258 |
+
endif()
|
| 259 |
+
if (GGML_AVX512)
|
| 260 |
+
list(APPEND ARCH_FLAGS -mavx512f)
|
| 261 |
+
list(APPEND ARCH_FLAGS -mavx512cd)
|
| 262 |
+
list(APPEND ARCH_FLAGS -mavx512vl)
|
| 263 |
+
list(APPEND ARCH_FLAGS -mavx512dq)
|
| 264 |
+
list(APPEND ARCH_FLAGS -mavx512bw)
|
| 265 |
+
list(APPEND ARCH_DEFINITIONS GGML_AVX512)
|
| 266 |
+
endif()
|
| 267 |
+
if (GGML_AVX512_VBMI)
|
| 268 |
+
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
| 269 |
+
list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI)
|
| 270 |
+
endif()
|
| 271 |
+
if (GGML_AVX512_VNNI)
|
| 272 |
+
list(APPEND ARCH_FLAGS -mavx512vnni)
|
| 273 |
+
list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI)
|
| 274 |
+
endif()
|
| 275 |
+
if (GGML_AVX512_BF16)
|
| 276 |
+
list(APPEND ARCH_FLAGS -mavx512bf16)
|
| 277 |
+
list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16)
|
| 278 |
+
endif()
|
| 279 |
+
if (GGML_AMX_TILE)
|
| 280 |
+
list(APPEND ARCH_FLAGS -mamx-tile)
|
| 281 |
+
list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE)
|
| 282 |
+
endif()
|
| 283 |
+
if (GGML_AMX_INT8)
|
| 284 |
+
list(APPEND ARCH_FLAGS -mamx-int8)
|
| 285 |
+
list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8)
|
| 286 |
+
endif()
|
| 287 |
+
if (GGML_AMX_BF16)
|
| 288 |
+
list(APPEND ARCH_FLAGS -mamx-bf16)
|
| 289 |
+
list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16)
|
| 290 |
+
endif()
|
| 291 |
endif()
|
| 292 |
endif()
|
| 293 |
+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
| 294 |
+
message(STATUS "PowerPC detected")
|
| 295 |
+
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
| 296 |
+
string(FIND "${POWER10_M}" "POWER10" substring_index)
|
| 297 |
+
if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
|
| 298 |
+
set(substring_index -1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
endif()
|
| 300 |
+
|
| 301 |
+
if (${substring_index} GREATER_EQUAL 0)
|
| 302 |
+
list(APPEND ARCH_FLAGS -mcpu=power10)
|
| 303 |
+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
| 304 |
+
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
| 305 |
+
else()
|
| 306 |
+
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
| 307 |
+
# TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
| 308 |
endif()
|
| 309 |
+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
| 310 |
+
message(STATUS "loongarch64 detected")
|
| 311 |
+
|
| 312 |
+
list(APPEND ARCH_FLAGS -march=loongarch64)
|
| 313 |
+
if (GGML_LASX)
|
| 314 |
+
list(APPEND ARCH_FLAGS -mlasx)
|
| 315 |
endif()
|
| 316 |
+
if (GGML_LSX)
|
| 317 |
+
list(APPEND ARCH_FLAGS -mlsx)
|
| 318 |
endif()
|
| 319 |
+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
|
| 320 |
+
message(STATUS "RISC-V detected")
|
| 321 |
+
if (GGML_RVV)
|
| 322 |
+
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
|
| 323 |
endif()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
else()
|
| 325 |
+
message(STATUS "Unknown architecture")
|
|
|
|
| 326 |
endif()
|
|
|
|
|
|
|
| 327 |
|
| 328 |
+
if (GGML_CPU_AARCH64)
|
| 329 |
+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
|
|
|
|
| 330 |
endif()
|
| 331 |
+
|
| 332 |
+
message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
|
| 333 |
+
target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
|
| 334 |
+
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
|
| 335 |
+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
| 336 |
+
|
| 337 |
+
if (GGML_BACKEND_DL)
|
| 338 |
+
# The feature detection code is compiled as a separate target so that
|
| 339 |
+
# it can be built without the architecture flags
|
| 340 |
+
# Since multiple variants of the CPU backend may be included in the same
|
| 341 |
+
# build, using set_source_files_properties() to set the arch flags is not possible
|
| 342 |
+
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
|
| 343 |
+
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
|
| 344 |
+
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
| 345 |
+
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
| 346 |
+
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
| 347 |
+
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
| 348 |
+
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
|
| 349 |
endif()
|
| 350 |
+
|
| 351 |
+
if (EMSCRIPTEN)
|
| 352 |
+
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
|
|
|
|
| 353 |
endif()
|
| 354 |
+
endfunction()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ggml/src/ggml-cpu/cpu-feats-x86.cpp
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
#include "ggml-cpu.h"
|
| 2 |
#include "ggml-backend-impl.h"
|
| 3 |
|
| 4 |
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
|
|
@@ -13,6 +12,7 @@
|
|
| 13 |
#include <array>
|
| 14 |
#include <string>
|
| 15 |
|
|
|
|
| 16 |
struct cpuid_x86 {
|
| 17 |
bool SSE3(void) { return f_1_ecx[0]; }
|
| 18 |
bool PCLMULQDQ(void) { return f_1_ecx[1]; }
|
|
@@ -50,11 +50,15 @@ struct cpuid_x86 {
|
|
| 50 |
bool INVPCID(void) { return f_7_ebx[10]; }
|
| 51 |
bool RTM(void) { return is_intel && f_7_ebx[11]; }
|
| 52 |
bool AVX512F(void) { return f_7_ebx[16]; }
|
|
|
|
| 53 |
bool RDSEED(void) { return f_7_ebx[18]; }
|
| 54 |
bool ADX(void) { return f_7_ebx[19]; }
|
| 55 |
bool AVX512PF(void) { return f_7_ebx[26]; }
|
| 56 |
bool AVX512ER(void) { return f_7_ebx[27]; }
|
| 57 |
bool AVX512CD(void) { return f_7_ebx[28]; }
|
|
|
|
|
|
|
|
|
|
| 58 |
bool SHA(void) { return f_7_ebx[29]; }
|
| 59 |
|
| 60 |
bool PREFETCHWT1(void) { return f_7_ecx[0]; }
|
|
@@ -259,36 +263,57 @@ void test_x86_is() {
|
|
| 259 |
static int ggml_backend_cpu_x86_score() {
|
| 260 |
// FIXME: this does not check for OS support
|
| 261 |
|
|
|
|
| 262 |
cpuid_x86 is;
|
| 263 |
-
// if the CPU backend was built with any features not supported by the current CPU, it cannot be used
|
| 264 |
-
if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
|
| 265 |
-
if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
|
| 266 |
-
if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
|
| 267 |
-
if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
|
| 268 |
-
if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
|
| 269 |
-
if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
|
| 270 |
-
if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
|
| 271 |
-
if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
|
| 272 |
-
if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
|
| 273 |
-
if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
|
| 274 |
-
if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
|
| 275 |
-
if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }
|
| 276 |
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
score +=
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
score +=
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
score +=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
return score;
|
| 294 |
}
|
|
|
|
|
|
|
| 1 |
#include "ggml-backend-impl.h"
|
| 2 |
|
| 3 |
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
|
|
|
|
| 12 |
#include <array>
|
| 13 |
#include <string>
|
| 14 |
|
| 15 |
+
// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
|
| 16 |
struct cpuid_x86 {
|
| 17 |
bool SSE3(void) { return f_1_ecx[0]; }
|
| 18 |
bool PCLMULQDQ(void) { return f_1_ecx[1]; }
|
|
|
|
| 50 |
bool INVPCID(void) { return f_7_ebx[10]; }
|
| 51 |
bool RTM(void) { return is_intel && f_7_ebx[11]; }
|
| 52 |
bool AVX512F(void) { return f_7_ebx[16]; }
|
| 53 |
+
bool AVX512DQ(void) { return f_7_ebx[17]; }
|
| 54 |
bool RDSEED(void) { return f_7_ebx[18]; }
|
| 55 |
bool ADX(void) { return f_7_ebx[19]; }
|
| 56 |
bool AVX512PF(void) { return f_7_ebx[26]; }
|
| 57 |
bool AVX512ER(void) { return f_7_ebx[27]; }
|
| 58 |
bool AVX512CD(void) { return f_7_ebx[28]; }
|
| 59 |
+
bool AVX512BW(void) { return f_7_ebx[30]; }
|
| 60 |
+
bool AVX512VL(void) { return f_7_ebx[31]; }
|
| 61 |
+
|
| 62 |
bool SHA(void) { return f_7_ebx[29]; }
|
| 63 |
|
| 64 |
bool PREFETCHWT1(void) { return f_7_ecx[0]; }
|
|
|
|
| 263 |
static int ggml_backend_cpu_x86_score() {
|
| 264 |
// FIXME: this does not check for OS support
|
| 265 |
|
| 266 |
+
int score = 0;
|
| 267 |
cpuid_x86 is;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
+
#ifdef GGML_FMA
|
| 270 |
+
if (!is.FMA()) { return 0; }
|
| 271 |
+
score += 1;
|
| 272 |
+
#endif
|
| 273 |
+
#ifdef GGML_F16C
|
| 274 |
+
if (!is.F16C()) { return 0; }
|
| 275 |
+
score += 1<<1;
|
| 276 |
+
#endif
|
| 277 |
+
#ifdef GGML_SSE42
|
| 278 |
+
if (!is.SSE42()) { return 0; }
|
| 279 |
+
score += 1<<2;
|
| 280 |
+
#endif
|
| 281 |
+
#ifdef GGML_AVX
|
| 282 |
+
if (!is.AVX()) { return 0; }
|
| 283 |
+
score += 1<<4;
|
| 284 |
+
#endif
|
| 285 |
+
#ifdef GGML_AVX2
|
| 286 |
+
if (!is.AVX2()) { return 0; }
|
| 287 |
+
score += 1<<5;
|
| 288 |
+
#endif
|
| 289 |
+
#ifdef GGML_AVX_VNNI
|
| 290 |
+
if (!is.AVX_VNNI()) { return 0; }
|
| 291 |
+
score += 1<<6;
|
| 292 |
+
#endif
|
| 293 |
+
#ifdef GGML_AVX512
|
| 294 |
+
if (!is.AVX512F()) { return 0; }
|
| 295 |
+
if (!is.AVX512CD()) { return 0; }
|
| 296 |
+
if (!is.AVX512VL()) { return 0; }
|
| 297 |
+
if (!is.AVX512DQ()) { return 0; }
|
| 298 |
+
if (!is.AVX512BW()) { return 0; }
|
| 299 |
+
score += 1<<7;
|
| 300 |
+
#endif
|
| 301 |
+
#ifdef GGML_AVX512_VBMI
|
| 302 |
+
if (!is.AVX512_VBMI()) { return 0; }
|
| 303 |
+
score += 1<<8;
|
| 304 |
+
#endif
|
| 305 |
+
#ifdef GGML_AVX512_BF16
|
| 306 |
+
if (!is.AVX512_BF16()) { return 0; }
|
| 307 |
+
score += 1<<9;
|
| 308 |
+
#endif
|
| 309 |
+
#ifdef GGML_AVX512_VNNI
|
| 310 |
+
if (!is.AVX512_VNNI()) { return 0; }
|
| 311 |
+
score += 1<<10;
|
| 312 |
+
#endif
|
| 313 |
+
#ifdef GGML_AMX_INT8
|
| 314 |
+
if (!is.AMX_INT8()) { return 0; }
|
| 315 |
+
score += 1<<11;
|
| 316 |
+
#endif
|
| 317 |
|
| 318 |
return score;
|
| 319 |
}
|
ggml/src/ggml-cpu/ggml-cpu.c
CHANGED
|
@@ -756,7 +756,7 @@ do { \
|
|
| 756 |
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
|
| 757 |
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
|
| 758 |
#else
|
| 759 |
-
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
|
| 760 |
float tmp[8];
|
| 761 |
|
| 762 |
for (int i = 0; i < 8; i++) {
|
|
|
|
| 756 |
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
|
| 757 |
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
|
| 758 |
#else
|
| 759 |
+
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
|
| 760 |
float tmp[8];
|
| 761 |
|
| 762 |
for (int i = 0; i < 8; i++) {
|
ggml/src/ggml-cpu/ggml-cpu.cpp
CHANGED
|
@@ -641,7 +641,15 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
|
| 641 |
if (ggml_cpu_has_llamafile()) {
|
| 642 |
features.push_back({ "LLAMAFILE", "1" });
|
| 643 |
}
|
| 644 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
#ifdef GGML_USE_CPU_AARCH64
|
| 646 |
features.push_back({ "AARCH64_REPACK", "1" });
|
| 647 |
#endif
|
|
|
|
| 641 |
if (ggml_cpu_has_llamafile()) {
|
| 642 |
features.push_back({ "LLAMAFILE", "1" });
|
| 643 |
}
|
| 644 |
+
#ifdef GGML_USE_ACCELERATE
|
| 645 |
+
features.push_back({ "ACCELERATE", "1" });
|
| 646 |
+
#endif
|
| 647 |
+
#ifdef GGML_USE_CPU_HBM
|
| 648 |
+
features.push_back({ "CPU_HBM", "1" });
|
| 649 |
+
#endif
|
| 650 |
+
#ifdef GGML_USE_OPENMP
|
| 651 |
+
features.push_back({ "OPENMP", "1" });
|
| 652 |
+
#endif
|
| 653 |
#ifdef GGML_USE_CPU_AARCH64
|
| 654 |
features.push_back({ "AARCH64_REPACK", "1" });
|
| 655 |
#endif
|