Diego Devesa commited on
Commit
1794b43
·
1 Parent(s): b3e6ea8

ggml : add predefined list of CPU backend variants to build (llama/10626)

Browse files

* ggml : add predefined list of CPU backend variants to build

* update CPU dockerfiles

ggml/CMakeLists.txt CHANGED
@@ -92,30 +92,33 @@ else()
92
  set(INS_ENB ON)
93
  endif()
94
 
95
- option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
96
- option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
97
-
98
- option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
99
- option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
100
- option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
101
- option(GGML_AVX512 "ggml: enable AVX512" OFF)
102
- option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
103
- option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
104
- option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
105
- option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
106
- option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
107
- option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
108
- option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
109
  if (NOT MSVC)
110
- option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
 
 
 
 
 
 
111
  endif()
112
- option(GGML_LASX "ggml: enable lasx" ON)
113
- option(GGML_LSX "ggml: enable lsx" ON)
114
- option(GGML_RVV "ggml: enable rvv" ON)
115
- option(GGML_SVE "ggml: enable SVE" OFF)
 
 
116
 
117
  if (WIN32)
118
- set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
119
  endif()
120
 
121
  # ggml core
@@ -180,11 +183,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
180
  set(CMAKE_C_STANDARD 11)
181
  set(CMAKE_C_STANDARD_REQUIRED true)
182
 
183
- if (GGML_SYCL)
184
- set(CMAKE_CXX_STANDARD 17)
185
- else()
186
- set(CMAKE_CXX_STANDARD 11)
187
- endif()
188
  set(CMAKE_CXX_STANDARD_REQUIRED true)
189
 
190
  set(THREADS_PREFER_PTHREAD_FLAG ON)
 
92
  set(INS_ENB ON)
93
  endif()
94
 
95
+ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
96
+ option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
97
+ option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
98
+ option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
99
+ option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
100
+ option(GGML_AVX512 "ggml: enable AVX512F" OFF)
101
+ option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
102
+ option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
103
+ option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
 
 
 
 
 
104
  if (NOT MSVC)
105
+ # in MSVC F16C and FMA is implied with AVX2/AVX512
106
+ option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
107
+ option(GGML_F16C "ggml: enable F16C" ${INS_ENB})
108
+ # MSVC does not seem to support AMX
109
+ option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
110
+ option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
111
+ option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
112
  endif()
113
+ option(GGML_LASX "ggml: enable lasx" ON)
114
+ option(GGML_LSX "ggml: enable lsx" ON)
115
+ option(GGML_RVV "ggml: enable rvv" ON)
116
+ option(GGML_SVE "ggml: enable SVE" OFF)
117
+ option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
118
+
119
 
120
  if (WIN32)
121
+ set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
122
  endif()
123
 
124
  # ggml core
 
183
  set(CMAKE_C_STANDARD 11)
184
  set(CMAKE_C_STANDARD_REQUIRED true)
185
 
186
+ set(CMAKE_CXX_STANDARD 17)
 
 
 
 
187
  set(CMAKE_CXX_STANDARD_REQUIRED true)
188
 
189
  set(THREADS_PREFER_PTHREAD_FLAG ON)
ggml/src/CMakeLists.txt CHANGED
@@ -269,7 +269,42 @@ function(ggml_add_backend backend)
269
  endif()
270
  endfunction()
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  ggml_add_backend(CPU)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  ggml_add_backend(BLAS)
274
  ggml_add_backend(CANN)
275
  ggml_add_backend(CUDA)
 
269
  endif()
270
  endfunction()
271
 
272
+ function(ggml_add_cpu_backend_variant tag_name)
273
+ set(GGML_CPU_TAG_NAME ${tag_name})
274
+ # other: OPENMP LLAMAFILE CPU_HBM
275
+ foreach (feat NATIVE
276
+ AVX AVX2 AVX_VNNI FMA F16C
277
+ AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
278
+ AMX_TILE AMX_INT8 AMX_BF16)
279
+ set(GGML_${feat} OFF)
280
+ endforeach()
281
+
282
+ foreach (feat ${ARGN})
283
+ set(GGML_${feat} ON)
284
+ endforeach()
285
+
286
+ ggml_add_cpu_backend_variant_impl(${tag_name})
287
+ endfunction()
288
+
289
  ggml_add_backend(CPU)
290
+
291
+ if (GGML_CPU_ALL_VARIANTS)
292
+ if (NOT GGML_BACKEND_DL)
293
+ message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
294
+ endif()
295
+ ggml_add_cpu_backend_variant(sandybridge AVX)
296
+ ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
297
+ ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
298
+ ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
299
+ if (NOT MSVC)
300
+ # MSVC doesn't support AVX-VNNI or AMX
301
+ ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
302
+ ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
303
+ endif()
304
+ else ()
305
+ ggml_add_cpu_backend_variant_impl("")
306
+ endif()
307
+
308
  ggml_add_backend(BLAS)
309
  ggml_add_backend(CANN)
310
  ggml_add_backend(CUDA)
ggml/src/ggml-backend-reg.cpp CHANGED
@@ -483,6 +483,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
483
  best_score = s;
484
  best_path = entry.path().string();
485
  }
 
 
 
 
486
  }
487
  }
488
  }
@@ -505,15 +509,21 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
505
  }
506
 
507
  void ggml_backend_load_all() {
508
- ggml_backend_load_best("blas", true);
509
- ggml_backend_load_best("cann", true);
510
- ggml_backend_load_best("cuda", true);
511
- ggml_backend_load_best("hip", true);
512
- ggml_backend_load_best("kompute", true);
513
- ggml_backend_load_best("metal", true);
514
- ggml_backend_load_best("rpc", true);
515
- ggml_backend_load_best("sycl", true);
516
- ggml_backend_load_best("vulkan", true);
517
- ggml_backend_load_best("musa", true);
518
- ggml_backend_load_best("cpu", true);
 
 
 
 
 
 
519
  }
 
483
  best_score = s;
484
  best_path = entry.path().string();
485
  }
486
+ } else {
487
+ if (!silent) {
488
+ GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
489
+ }
490
  }
491
  }
492
  }
 
509
  }
510
 
511
  void ggml_backend_load_all() {
512
+ #ifdef NDEBUG
513
+ bool silent = true;
514
+ #else
515
+ bool silent = false;
516
+ #endif
517
+
518
+ ggml_backend_load_best("blas", silent);
519
+ ggml_backend_load_best("cann", silent);
520
+ ggml_backend_load_best("cuda", silent);
521
+ ggml_backend_load_best("hip", silent);
522
+ ggml_backend_load_best("kompute", silent);
523
+ ggml_backend_load_best("metal", silent);
524
+ ggml_backend_load_best("rpc", silent);
525
+ ggml_backend_load_best("sycl", silent);
526
+ ggml_backend_load_best("vulkan", silent);
527
+ ggml_backend_load_best("musa", silent);
528
+ ggml_backend_load_best("cpu", silent);
529
  }
ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -1,319 +1,354 @@
1
- ggml_add_backend_library(ggml-cpu)
2
-
3
- list (APPEND GGML_CPU_SOURCES
4
- ggml-cpu.c
5
- ggml-cpu.cpp
6
- ggml-cpu-aarch64.c
7
- ggml-cpu-aarch64.h
8
- ggml-cpu-quants.c
9
- ggml-cpu-quants.h
10
- amx/amx.cpp
11
- amx/amx.h
12
- amx/mmq.cpp
13
- amx/mmq.h
14
- ggml-cpu-impl.h
15
- )
16
-
17
- target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
18
- target_include_directories(ggml-cpu PRIVATE .)
19
-
20
- if (APPLE AND GGML_ACCELERATE)
21
- find_library(ACCELERATE_FRAMEWORK Accelerate)
22
- if (ACCELERATE_FRAMEWORK)
23
- message(STATUS "Accelerate framework found")
24
-
25
- target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
26
- target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
27
- target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)
28
-
29
- target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
30
  else()
31
- message(WARNING "Accelerate framework not found")
32
  endif()
33
- endif()
34
 
35
- if (GGML_OPENMP)
36
- find_package(OpenMP)
37
- if (OpenMP_FOUND)
38
- message(STATUS "OpenMP found")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)
 
 
 
41
 
42
- target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
43
- else()
44
- message(WARNING "OpenMP not found")
 
45
  endif()
46
- endif()
47
-
48
- if (GGML_LLAMAFILE)
49
- message(STATUS "Using llamafile")
50
 
51
- target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)
 
52
 
53
- list(APPEND GGML_CPU_SOURCES
54
- llamafile/sgemm.cpp
55
- llamafile/sgemm.h)
56
- endif()
57
 
58
- if (GGML_CPU_HBM)
59
- find_library(memkind memkind REQUIRED)
60
 
61
- message(STATUS "Using memkind for CPU HBM")
62
 
63
- target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)
64
 
65
- target_link_libraries(ggml-cpu PUBLIC memkind)
66
- endif()
67
 
68
- if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
69
- CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
70
- (NOT CMAKE_OSX_ARCHITECTURES AND
71
- NOT CMAKE_GENERATOR_PLATFORM_LWR AND
72
- CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
73
 
74
- message(STATUS "ARM detected")
75
 
76
- if (MSVC)
77
- list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
78
- list(APPEND ARCH_DEFINITIONS __ARM_NEON)
79
- list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
80
 
81
- set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
82
- string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
83
 
84
- check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
85
- if (GGML_COMPILER_SUPPORT_DOTPROD)
86
- list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
87
 
88
- message(STATUS "ARM feature DOTPROD enabled")
89
- endif ()
90
 
91
- check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
92
 
93
- if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
94
- list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
95
 
96
- message(STATUS "ARM feature MATMUL_INT8 enabled")
97
- endif ()
98
 
99
- check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
100
- if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
101
- list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
102
 
103
- message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
104
- endif ()
105
 
106
- set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
107
- elseif (APPLE)
108
- if (GGML_NATIVE)
109
- set(USER_PROVIDED_MARCH FALSE)
110
- foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
111
- if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
112
- set(USER_PROVIDED_MARCH TRUE)
113
- break()
114
- endif()
115
- endforeach()
116
 
117
- if (NOT USER_PROVIDED_MARCH)
118
- set(MARCH_FLAGS "-march=armv8.2a")
119
 
120
- check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
121
- if (GGML_COMPILER_SUPPORT_DOTPROD)
122
- set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
123
- list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
124
 
125
- message(STATUS "ARM feature DOTPROD enabled")
126
- endif ()
127
 
128
- set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
129
 
130
- set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
131
- set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
132
 
133
- check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
134
- if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
135
- set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
136
- list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
137
 
138
- message(STATUS "ARM feature MATMUL_INT8 enabled")
139
- endif ()
140
 
141
- set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
142
 
143
- list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
 
144
  endif ()
145
- endif ()
146
- else()
147
- check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
148
- if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
149
- list(APPEND ARCH_FLAGS -mfp16-format=ieee)
150
- endif()
151
- if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
152
- # Raspberry Pi 1, Zero
153
- list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
154
- endif()
155
- if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
156
- if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
157
- # Android armeabi-v7a
158
- list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
159
- else()
160
- # Raspberry Pi 2
161
- list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
162
  endif()
163
- endif()
164
- if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
165
- # Android arm64-v8a
166
- # Raspberry Pi 3, 4, Zero 2 (32-bit)
167
- list(APPEND ARCH_FLAGS -mno-unaligned-access)
168
- endif()
169
- if (GGML_SVE)
170
- list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
171
- endif()
172
- endif()
173
- elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
174
- (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
175
- CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
176
- message(STATUS "x86 detected")
177
- if (MSVC)
178
- # instruction set detection for MSVC only
179
- if (GGML_NATIVE)
180
- include(cmake/FindSIMD.cmake)
181
- endif ()
182
- if (GGML_AVX512)
183
- list(APPEND ARCH_FLAGS /arch:AVX512)
184
- # MSVC has no compile-time flags enabling specific
185
- # AVX512 extensions, neither it defines the
186
- # macros corresponding to the extensions.
187
- # Do it manually.
188
- if (GGML_AVX512_VBMI)
189
- list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
190
- if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
191
- list(APPEND ARCH_FLAGS -mavx512vbmi)
192
- endif()
193
  endif()
194
- if (GGML_AVX512_VNNI)
195
- list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
196
- if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
197
- list(APPEND ARCH_FLAGS -mavx512vnni)
 
 
 
198
  endif()
199
  endif()
200
- if (GGML_AVX512_BF16)
201
- list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
202
- if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
203
- list(APPEND ARCH_FLAGS -mavx512bf16)
204
- endif()
205
  endif()
206
- if (GGML_AMX_TILE)
207
- list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
208
  endif()
209
- if (GGML_AMX_INT8)
210
- list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  endif()
212
- if (GGML_AMX_BF16)
213
- list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
 
214
  endif()
215
- elseif (GGML_AVX2)
216
- list(APPEND ARCH_FLAGS /arch:AVX2)
217
- elseif (GGML_AVX)
218
- list(APPEND ARCH_FLAGS /arch:AVX)
219
- endif()
220
- if (GGML_AVX_VNNI)
221
- list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
222
- if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
223
- list(APPEND ARCH_FLAGS -mavxvnni)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  endif()
225
  endif()
226
- else()
227
- if (GGML_NATIVE)
228
- list(APPEND ARCH_FLAGS -march=native)
229
- endif()
230
- if (GGML_F16C)
231
- list(APPEND ARCH_FLAGS -mf16c)
232
- endif()
233
- if (GGML_FMA)
234
- list(APPEND ARCH_FLAGS -mfma)
235
- endif()
236
- if (GGML_AVX)
237
- list(APPEND ARCH_FLAGS -mavx)
238
- endif()
239
- if (GGML_AVX2)
240
- list(APPEND ARCH_FLAGS -mavx2)
241
- endif()
242
- if (GGML_AVX_VNNI)
243
- list(APPEND ARCH_FLAGS -mavxvnni)
244
- endif()
245
- if (GGML_AVX512)
246
- list(APPEND ARCH_FLAGS -mavx512f)
247
- list(APPEND ARCH_FLAGS -mavx512dq)
248
- list(APPEND ARCH_FLAGS -mavx512bw)
249
- endif()
250
- if (GGML_AVX512_VBMI)
251
- list(APPEND ARCH_FLAGS -mavx512vbmi)
252
  endif()
253
- if (GGML_AVX512_VNNI)
254
- list(APPEND ARCH_FLAGS -mavx512vnni)
255
- endif()
256
- if (GGML_AVX512_BF16)
257
- list(APPEND ARCH_FLAGS -mavx512bf16)
 
 
 
258
  endif()
259
- if (GGML_AMX_TILE)
260
- list(APPEND ARCH_FLAGS -mamx-tile)
 
 
 
 
261
  endif()
262
- if (GGML_AMX_INT8)
263
- list(APPEND ARCH_FLAGS -mamx-int8)
264
  endif()
265
- if (GGML_AMX_BF16)
266
- list(APPEND ARCH_FLAGS -mamx-bf16)
 
 
267
  endif()
268
- endif()
269
- elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
270
- message(STATUS "PowerPC detected")
271
- execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
272
- string(FIND "${POWER10_M}" "POWER10" substring_index)
273
- if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
274
- set(substring_index -1)
275
- endif()
276
-
277
- if (${substring_index} GREATER_EQUAL 0)
278
- list(APPEND ARCH_FLAGS -mcpu=power10)
279
- elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
280
- list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
281
  else()
282
- list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
283
- # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
284
  endif()
285
- elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
286
- message(STATUS "loongarch64 detected")
287
 
288
- list(APPEND ARCH_FLAGS -march=loongarch64)
289
- if (GGML_LASX)
290
- list(APPEND ARCH_FLAGS -mlasx)
291
  endif()
292
- if (GGML_LSX)
293
- list(APPEND ARCH_FLAGS -mlsx)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  endif()
295
- elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
296
- message(STATUS "RISC-V detected")
297
- if (GGML_RVV)
298
- list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
299
  endif()
300
- else()
301
- message(STATUS "Unknown architecture")
302
- endif()
303
-
304
- if (GGML_CPU_AARCH64)
305
- message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
306
- target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
307
- endif()
308
-
309
- target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
310
- set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}")
311
- set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
312
-
313
- # the feature detection code must be compiled without any architecture flags
314
- target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
315
- # target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection
316
-
317
- if (EMSCRIPTEN)
318
- set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
319
- endif()
 
1
+ function(ggml_add_cpu_backend_variant_impl tag_name)
2
+ if (tag_name)
3
+ set(GGML_CPU_NAME ggml-cpu-${tag_name})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  else()
5
+ set(GGML_CPU_NAME ggml-cpu)
6
  endif()
 
7
 
8
+ ggml_add_backend_library(${GGML_CPU_NAME})
9
+
10
+ list (APPEND GGML_CPU_SOURCES
11
+ ggml-cpu/ggml-cpu.c
12
+ ggml-cpu/ggml-cpu.cpp
13
+ ggml-cpu/ggml-cpu-aarch64.c
14
+ ggml-cpu/ggml-cpu-aarch64.h
15
+ ggml-cpu/ggml-cpu-quants.c
16
+ ggml-cpu/ggml-cpu-quants.h
17
+ ggml-cpu/amx/amx.cpp
18
+ ggml-cpu/amx/amx.h
19
+ ggml-cpu/amx/mmq.cpp
20
+ ggml-cpu/amx/mmq.h
21
+ ggml-cpu/ggml-cpu-impl.h
22
+ )
23
+
24
+ target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
25
+ target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)
26
+
27
+ if (APPLE AND GGML_ACCELERATE)
28
+ find_library(ACCELERATE_FRAMEWORK Accelerate)
29
+ if (ACCELERATE_FRAMEWORK)
30
+ message(STATUS "Accelerate framework found")
31
+
32
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE)
33
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK)
34
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64)
35
+
36
+ target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK})
37
+ else()
38
+ message(WARNING "Accelerate framework not found")
39
+ endif()
40
+ endif()
41
 
42
+ if (GGML_OPENMP)
43
+ find_package(OpenMP)
44
+ if (OpenMP_FOUND)
45
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
46
 
47
+ target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
48
+ else()
49
+ message(WARNING "OpenMP not found")
50
+ endif()
51
  endif()
 
 
 
 
52
 
53
+ if (GGML_LLAMAFILE)
54
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
55
 
56
+ list(APPEND GGML_CPU_SOURCES
57
+ ggml-cpu/llamafile/sgemm.cpp
58
+ ggml-cpu/llamafile/sgemm.h)
59
+ endif()
60
 
61
+ if (GGML_CPU_HBM)
62
+ find_library(memkind memkind REQUIRED)
63
 
64
+ message(STATUS "Using memkind for CPU HBM")
65
 
66
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM)
67
 
68
+ target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
69
+ endif()
70
 
71
+ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
72
+ CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
73
+ (NOT CMAKE_OSX_ARCHITECTURES AND
74
+ NOT CMAKE_GENERATOR_PLATFORM_LWR AND
75
+ CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
76
 
77
+ message(STATUS "ARM detected")
78
 
79
+ if (MSVC)
80
+ list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
81
+ list(APPEND ARCH_DEFINITIONS __ARM_NEON)
82
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
83
 
84
+ set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
85
+ string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
86
 
87
+ check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
88
+ if (GGML_COMPILER_SUPPORT_DOTPROD)
89
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
90
 
91
+ message(STATUS "ARM feature DOTPROD enabled")
92
+ endif ()
93
 
94
+ check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
95
 
96
+ if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
97
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
98
 
99
+ message(STATUS "ARM feature MATMUL_INT8 enabled")
100
+ endif ()
101
 
102
+ check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
103
+ if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
104
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
105
 
106
+ message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
107
+ endif ()
108
 
109
+ set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
110
+ elseif (APPLE)
111
+ if (GGML_NATIVE)
112
+ set(USER_PROVIDED_MARCH FALSE)
113
+ foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
114
+ if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
115
+ set(USER_PROVIDED_MARCH TRUE)
116
+ break()
117
+ endif()
118
+ endforeach()
119
 
120
+ if (NOT USER_PROVIDED_MARCH)
121
+ set(MARCH_FLAGS "-march=armv8.2a")
122
 
123
+ check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
124
+ if (GGML_COMPILER_SUPPORT_DOTPROD)
125
+ set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
126
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
127
 
128
+ message(STATUS "ARM feature DOTPROD enabled")
129
+ endif ()
130
 
131
+ set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
132
 
133
+ set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
134
+ set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
135
 
136
+ check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
137
+ if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
138
+ set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
139
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
140
 
141
+ message(STATUS "ARM feature MATMUL_INT8 enabled")
142
+ endif ()
143
 
144
+ set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
145
 
146
+ list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
147
+ endif ()
148
  endif ()
149
+ else()
150
+ check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
151
+ if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
152
+ list(APPEND ARCH_FLAGS -mfp16-format=ieee)
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  endif()
154
+ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
155
+ # Raspberry Pi 1, Zero
156
+ list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  endif()
158
+ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
159
+ if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
160
+ # Android armeabi-v7a
161
+ list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
162
+ else()
163
+ # Raspberry Pi 2
164
+ list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
165
  endif()
166
  endif()
167
+ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
168
+ # Android arm64-v8a
169
+ # Raspberry Pi 3, 4, Zero 2 (32-bit)
170
+ list(APPEND ARCH_FLAGS -mno-unaligned-access)
 
171
  endif()
172
+ if (GGML_SVE)
173
+ list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
174
  endif()
175
+ endif()
176
+ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
177
+ (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
178
+ CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
179
+ if (MSVC)
180
+ # instruction set detection for MSVC only
181
+ if (GGML_NATIVE)
182
+ include(ggml-cpu/cmake/FindSIMD.cmake)
183
+ endif ()
184
+ if (GGML_AVX512)
185
+ list(APPEND ARCH_FLAGS /arch:AVX512)
186
+ # /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
187
+ # MSVC has no compile-time flags enabling specific
188
+ # AVX512 extensions, neither it defines the
189
+ # macros corresponding to the extensions.
190
+ # Do it manually.
191
+ list(APPEND ARCH_DEFINITIONS GGML_AVX512)
192
+ if (GGML_AVX512_VBMI)
193
+ list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
194
+ if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
195
+ list(APPEND ARCH_FLAGS -mavx512vbmi)
196
+ endif()
197
+ endif()
198
+ if (GGML_AVX512_VNNI)
199
+ list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI)
200
+ if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
201
+ list(APPEND ARCH_FLAGS -mavx512vnni)
202
+ endif()
203
+ endif()
204
+ if (GGML_AVX512_BF16)
205
+ list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16)
206
+ if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
207
+ list(APPEND ARCH_FLAGS -mavx512bf16)
208
+ endif()
209
+ endif()
210
+ if (GGML_AMX_TILE)
211
+ list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE)
212
+ endif()
213
+ if (GGML_AMX_INT8)
214
+ list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8)
215
+ endif()
216
+ if (GGML_AMX_BF16)
217
+ list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16)
218
+ endif()
219
+ elseif (GGML_AVX2)
220
+ list(APPEND ARCH_FLAGS /arch:AVX2)
221
+ list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C)
222
+ elseif (GGML_AVX)
223
+ list(APPEND ARCH_FLAGS /arch:AVX)
224
+ list(APPEND ARCH_DEFINITIONS GGML_AVX)
225
+ else ()
226
+ list(APPEND ARCH_FLAGS /arch:SSE4.2)
227
+ list(APPEND ARCH_DEFINITIONS GGML_SSE42)
228
  endif()
229
+ if (GGML_AVX_VNNI)
230
+ # MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2
231
+ #list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
232
  endif()
233
+ else ()
234
+ if (GGML_NATIVE)
235
+ list(APPEND ARCH_FLAGS -march=native)
236
+ else ()
237
+ list(APPEND ARCH_FLAGS -msse4.2)
238
+ list(APPEND ARCH_DEFINITIONS GGML_SSE42)
239
+ if (GGML_F16C)
240
+ list(APPEND ARCH_FLAGS -mf16c)
241
+ list(APPEND ARCH_DEFINITIONS GGML_F16C)
242
+ endif()
243
+ if (GGML_FMA)
244
+ list(APPEND ARCH_FLAGS -mfma)
245
+ list(APPEND ARCH_DEFINITIONS GGML_FMA)
246
+ endif()
247
+ if (GGML_AVX)
248
+ list(APPEND ARCH_FLAGS -mavx)
249
+ list(APPEND ARCH_DEFINITIONS GGML_AVX)
250
+ endif()
251
+ if (GGML_AVX2)
252
+ list(APPEND ARCH_FLAGS -mavx2)
253
+ list(APPEND ARCH_DEFINITIONS GGML_AVX2)
254
+ endif()
255
+ if (GGML_AVX_VNNI)
256
+ list(APPEND ARCH_FLAGS -mavxvnni)
257
+ list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI)
258
+ endif()
259
+ if (GGML_AVX512)
260
+ list(APPEND ARCH_FLAGS -mavx512f)
261
+ list(APPEND ARCH_FLAGS -mavx512cd)
262
+ list(APPEND ARCH_FLAGS -mavx512vl)
263
+ list(APPEND ARCH_FLAGS -mavx512dq)
264
+ list(APPEND ARCH_FLAGS -mavx512bw)
265
+ list(APPEND ARCH_DEFINITIONS GGML_AVX512)
266
+ endif()
267
+ if (GGML_AVX512_VBMI)
268
+ list(APPEND ARCH_FLAGS -mavx512vbmi)
269
+ list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI)
270
+ endif()
271
+ if (GGML_AVX512_VNNI)
272
+ list(APPEND ARCH_FLAGS -mavx512vnni)
273
+ list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI)
274
+ endif()
275
+ if (GGML_AVX512_BF16)
276
+ list(APPEND ARCH_FLAGS -mavx512bf16)
277
+ list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16)
278
+ endif()
279
+ if (GGML_AMX_TILE)
280
+ list(APPEND ARCH_FLAGS -mamx-tile)
281
+ list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE)
282
+ endif()
283
+ if (GGML_AMX_INT8)
284
+ list(APPEND ARCH_FLAGS -mamx-int8)
285
+ list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8)
286
+ endif()
287
+ if (GGML_AMX_BF16)
288
+ list(APPEND ARCH_FLAGS -mamx-bf16)
289
+ list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16)
290
+ endif()
291
  endif()
292
  endif()
293
+ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
294
+ message(STATUS "PowerPC detected")
295
+ execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
296
+ string(FIND "${POWER10_M}" "POWER10" substring_index)
297
+ if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
298
+ set(substring_index -1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  endif()
300
+
301
+ if (${substring_index} GREATER_EQUAL 0)
302
+ list(APPEND ARCH_FLAGS -mcpu=power10)
303
+ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
304
+ list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
305
+ else()
306
+ list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
307
+ # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
308
  endif()
309
+ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
310
+ message(STATUS "loongarch64 detected")
311
+
312
+ list(APPEND ARCH_FLAGS -march=loongarch64)
313
+ if (GGML_LASX)
314
+ list(APPEND ARCH_FLAGS -mlasx)
315
  endif()
316
+ if (GGML_LSX)
317
+ list(APPEND ARCH_FLAGS -mlsx)
318
  endif()
319
+ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
320
+ message(STATUS "RISC-V detected")
321
+ if (GGML_RVV)
322
+ list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
323
  endif()
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  else()
325
+ message(STATUS "Unknown architecture")
 
326
  endif()
 
 
327
 
328
+ if (GGML_CPU_AARCH64)
329
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
 
330
  endif()
331
+
332
+ message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
333
+ target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
334
+ target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
335
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
336
+
337
+ if (GGML_BACKEND_DL)
338
+ # The feature detection code is compiled as a separate target so that
339
+ # it can be built without the architecture flags
340
+ # Since multiple variants of the CPU backend may be included in the same
341
+ # build, using set_source_files_properties() to set the arch flags is not possible
342
+ set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
343
+ add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
344
+ target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
345
+ target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
346
+ target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
347
+ set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
348
+ target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
349
  endif()
350
+
351
+ if (EMSCRIPTEN)
352
+ set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
 
353
  endif()
354
+ endfunction()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cpu/cpu-feats-x86.cpp CHANGED
@@ -1,4 +1,3 @@
1
- #include "ggml-cpu.h"
2
  #include "ggml-backend-impl.h"
3
 
4
  #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
@@ -13,6 +12,7 @@
13
  #include <array>
14
  #include <string>
15
 
 
16
  struct cpuid_x86 {
17
  bool SSE3(void) { return f_1_ecx[0]; }
18
  bool PCLMULQDQ(void) { return f_1_ecx[1]; }
@@ -50,11 +50,15 @@ struct cpuid_x86 {
50
  bool INVPCID(void) { return f_7_ebx[10]; }
51
  bool RTM(void) { return is_intel && f_7_ebx[11]; }
52
  bool AVX512F(void) { return f_7_ebx[16]; }
 
53
  bool RDSEED(void) { return f_7_ebx[18]; }
54
  bool ADX(void) { return f_7_ebx[19]; }
55
  bool AVX512PF(void) { return f_7_ebx[26]; }
56
  bool AVX512ER(void) { return f_7_ebx[27]; }
57
  bool AVX512CD(void) { return f_7_ebx[28]; }
 
 
 
58
  bool SHA(void) { return f_7_ebx[29]; }
59
 
60
  bool PREFETCHWT1(void) { return f_7_ecx[0]; }
@@ -259,36 +263,57 @@ void test_x86_is() {
259
  static int ggml_backend_cpu_x86_score() {
260
  // FIXME: this does not check for OS support
261
 
 
262
  cpuid_x86 is;
263
- // if the CPU backend was built with any features not supported by the current CPU, it cannot be used
264
- if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
265
- if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
266
- if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
267
- if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
268
- if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
269
- if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
270
- if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
271
- if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
272
- if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
273
- if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
274
- if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
275
- if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }
276
 
277
- // calculate a backend score based on the supported features
278
- // more important features have a higher weight
279
- int score = 0;
280
- score += ggml_cpu_has_fma () * 1;
281
- score += ggml_cpu_has_f16c () * 1<<1;
282
- score += ggml_cpu_has_ssse3 () * 1<<2;
283
- score += ggml_cpu_has_sse3 () * 1<<3;
284
- score += ggml_cpu_has_avx_vnni () * 1<<4;
285
- score += ggml_cpu_has_avx () * 1<<5;
286
- score += ggml_cpu_has_avx2 () * 1<<6;
287
- score += ggml_cpu_has_avx512 () * 1<<7;
288
- // score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used
289
- score += ggml_cpu_has_avx512_bf16() * 1<<9;
290
- score += ggml_cpu_has_avx512_vnni() * 1<<10;
291
- score += ggml_cpu_has_amx_int8 () * 1<<11;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  return score;
294
  }
 
 
1
  #include "ggml-backend-impl.h"
2
 
3
  #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
 
12
  #include <array>
13
  #include <string>
14
 
15
+ // ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
16
  struct cpuid_x86 {
17
  bool SSE3(void) { return f_1_ecx[0]; }
18
  bool PCLMULQDQ(void) { return f_1_ecx[1]; }
 
50
  bool INVPCID(void) { return f_7_ebx[10]; }
51
  bool RTM(void) { return is_intel && f_7_ebx[11]; }
52
  bool AVX512F(void) { return f_7_ebx[16]; }
53
+ bool AVX512DQ(void) { return f_7_ebx[17]; }
54
  bool RDSEED(void) { return f_7_ebx[18]; }
55
  bool ADX(void) { return f_7_ebx[19]; }
56
  bool AVX512PF(void) { return f_7_ebx[26]; }
57
  bool AVX512ER(void) { return f_7_ebx[27]; }
58
  bool AVX512CD(void) { return f_7_ebx[28]; }
59
+ bool AVX512BW(void) { return f_7_ebx[30]; }
60
+ bool AVX512VL(void) { return f_7_ebx[31]; }
61
+
62
  bool SHA(void) { return f_7_ebx[29]; }
63
 
64
  bool PREFETCHWT1(void) { return f_7_ecx[0]; }
 
263
  static int ggml_backend_cpu_x86_score() {
264
  // FIXME: this does not check for OS support
265
 
266
+ int score = 0;
267
  cpuid_x86 is;
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
+ #ifdef GGML_FMA
270
+ if (!is.FMA()) { return 0; }
271
+ score += 1;
272
+ #endif
273
+ #ifdef GGML_F16C
274
+ if (!is.F16C()) { return 0; }
275
+ score += 1<<1;
276
+ #endif
277
+ #ifdef GGML_SSE42
278
+ if (!is.SSE42()) { return 0; }
279
+ score += 1<<2;
280
+ #endif
281
+ #ifdef GGML_AVX
282
+ if (!is.AVX()) { return 0; }
283
+ score += 1<<4;
284
+ #endif
285
+ #ifdef GGML_AVX2
286
+ if (!is.AVX2()) { return 0; }
287
+ score += 1<<5;
288
+ #endif
289
+ #ifdef GGML_AVX_VNNI
290
+ if (!is.AVX_VNNI()) { return 0; }
291
+ score += 1<<6;
292
+ #endif
293
+ #ifdef GGML_AVX512
294
+ if (!is.AVX512F()) { return 0; }
295
+ if (!is.AVX512CD()) { return 0; }
296
+ if (!is.AVX512VL()) { return 0; }
297
+ if (!is.AVX512DQ()) { return 0; }
298
+ if (!is.AVX512BW()) { return 0; }
299
+ score += 1<<7;
300
+ #endif
301
+ #ifdef GGML_AVX512_VBMI
302
+ if (!is.AVX512_VBMI()) { return 0; }
303
+ score += 1<<8;
304
+ #endif
305
+ #ifdef GGML_AVX512_BF16
306
+ if (!is.AVX512_BF16()) { return 0; }
307
+ score += 1<<9;
308
+ #endif
309
+ #ifdef GGML_AVX512_VNNI
310
+ if (!is.AVX512_VNNI()) { return 0; }
311
+ score += 1<<10;
312
+ #endif
313
+ #ifdef GGML_AMX_INT8
314
+ if (!is.AMX_INT8()) { return 0; }
315
+ score += 1<<11;
316
+ #endif
317
 
318
  return score;
319
  }
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -756,7 +756,7 @@ do { \
756
  #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
757
  #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
758
  #else
759
- static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
760
  float tmp[8];
761
 
762
  for (int i = 0; i < 8; i++) {
 
756
  #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
757
  #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
758
  #else
759
+ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
760
  float tmp[8];
761
 
762
  for (int i = 0; i < 8; i++) {
ggml/src/ggml-cpu/ggml-cpu.cpp CHANGED
@@ -641,7 +641,15 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
641
  if (ggml_cpu_has_llamafile()) {
642
  features.push_back({ "LLAMAFILE", "1" });
643
  }
644
- // TODO: rename this
 
 
 
 
 
 
 
 
645
  #ifdef GGML_USE_CPU_AARCH64
646
  features.push_back({ "AARCH64_REPACK", "1" });
647
  #endif
 
641
  if (ggml_cpu_has_llamafile()) {
642
  features.push_back({ "LLAMAFILE", "1" });
643
  }
644
+ #ifdef GGML_USE_ACCELERATE
645
+ features.push_back({ "ACCELERATE", "1" });
646
+ #endif
647
+ #ifdef GGML_USE_CPU_HBM
648
+ features.push_back({ "CPU_HBM", "1" });
649
+ #endif
650
+ #ifdef GGML_USE_OPENMP
651
+ features.push_back({ "OPENMP", "1" });
652
+ #endif
653
  #ifdef GGML_USE_CPU_AARCH64
654
  features.push_back({ "AARCH64_REPACK", "1" });
655
  #endif