jeffbolznv committed · Commit e9ee893 · 1 Parent(s): 385f335

vulkan: Implement "fast divide" (mul+shift) for unary ops like copy (llama/10642)

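For background on the trick: the patch cites figure 4.1 of Granlund & Montgomery's divcnst-pldi94 paper. For a divisor d fixed before the shader runs, the host precomputes a shift L and a 32-bit magic m' so the GPU can replace an integer divide with a high multiply, an add, and a shift. A sketch of the relation (my summary of the cited figure, not text from the patch):

$$ L = \lceil \log_2 d \rceil, \qquad m' = \left\lfloor \frac{2^{32}\,(2^{L}-d)}{d} \right\rfloor + 1, \qquad \left\lfloor \frac{n}{d} \right\rfloor = \bigl(\operatorname{mulhi}(n, m') + n\bigr) \gg L . $$

The sum mulhi(n, m') + n needs 33 bits in the worst case, but since mulhi(n, m') < n for n >= 1, the 32-bit add cannot wrap for n < 2^31, and the flattened element indices divided here stay well below that.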
ggml/src/ggml-vulkan/ggml-vulkan.cpp CHANGED
@@ -353,7 +353,45 @@ struct vk_op_unary_push_constants {
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
     uint32_t d_offset;
     float param1; float param2;
+    uint32_t ne0_012mp; uint32_t ne0_012L;
+    uint32_t ne0_01mp; uint32_t ne0_01L;
+    uint32_t ne0_0mp; uint32_t ne0_0L;
+    uint32_t ne1_012mp; uint32_t ne1_012L;
+    uint32_t ne1_01mp; uint32_t ne1_01L;
+    uint32_t ne1_0mp; uint32_t ne1_0L;
 };
+static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
+
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
+// Precompute mp (m' in the paper) and L such that division
+// can be computed using a multiply (high 32b of 64b result)
+// and a shift:
+//
+// n/d = (mulhi(n, mp) + n) >> L;
+void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L)
+{
+    // compute L = ceil(log2(d));
+    L = 0;
+    while (L < 32 && (uint32_t{1} << L) < d) {
+        L++;
+    }
+
+    mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
+}
+
+template <typename T> void init_pushconst_fastdiv(T &p) {
+    static_assert(!std::is_const<T>::value, "unexpected type");
+}
+
+template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) {
+    // Compute magic values to divide by these six numbers.
+    init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, p.ne0_012L);
+    init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, p.ne0_01L);
+    init_fastdiv_values(p.ne00, p.ne0_0mp, p.ne0_0L);
+    init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, p.ne1_012L);
+    init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, p.ne1_01L);
+    init_fastdiv_values(p.ne10, p.ne1_0mp, p.ne1_0L);
+}
 
 struct vk_op_binary_push_constants {
     uint32_t ne;
@@ -2914,13 +2952,14 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
         elements = { ne, 1, 1 };
     }
 
-    const vk_op_unary_push_constants pc = {
+    vk_op_unary_push_constants pc = {
        (uint32_t)ne,
        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]),
        0,
        0.0f, 0.0f,
    };
+    init_pushconst_fastdiv(pc);
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
 }
@@ -4125,7 +4164,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 }
 
 template<typename PC>
-static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc, bool dryrun = false) {
+static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
         std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
@@ -4165,6 +4204,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     const uint64_t ned3 = dst->ne[3];
     const uint64_t ned = ned0 * ned1;
 
+    init_pushconst_fastdiv(pc);
+
     vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
 
     if (pipeline == nullptr) {
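The static_assert above guards Vulkan's guaranteed push-constant minimum: maxPushConstantsSize may be as small as 128 bytes, and the six new mp/L pairs bring the struct right up against that limit. As a sanity check on the magic-value math, here is a minimal standalone C++ sketch (not part of the patch; the divisor and index-range choices are arbitrary) that mirrors init_fastdiv_values and brute-forces the identity against plain division:

// Standalone sanity check (not part of the patch): mirrors init_fastdiv_values
// and verifies (mulhi(n, mp) + n) >> L against plain division.
#include <cassert>
#include <cstdint>
#include <cstdio>

static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) {
    // L = ceil(log2(d))
    L = 0;
    while (L < 32 && (uint32_t{1} << L) < d) {
        L++;
    }
    mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
}

static uint32_t fastdiv(uint32_t n, uint32_t mp, uint32_t L) {
    const uint32_t msbs = (uint32_t)(((uint64_t)n * mp) >> 32); // mulhi(n, mp)
    return (msbs + n) >> L; // 32-bit add cannot wrap for n < 2^31
}

int main() {
    const uint32_t divisors[] = { 1, 3, 7, 160, 4096, 11008, 4096 * 32 };
    for (uint32_t d : divisors) {
        uint32_t mp, L;
        init_fastdiv_values(d, mp, L);
        for (uint32_t n = 0; n < (1u << 20); n++) {
            assert(fastdiv(n, mp, L) == n / d);
        }
    }
    printf("fastdiv matches n/d for all tested divisors\n");
    return 0;
}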
ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp CHANGED
@@ -8,6 +8,13 @@ layout (push_constant) uniform parameter
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint d_offset;
     float param1; float param2;
+
+    uint ne0_012mp; uint ne0_012L;
+    uint ne0_01mp; uint ne0_01L;
+    uint ne0_0mp; uint ne0_0L;
+    uint ne1_012mp; uint ne1_012L;
+    uint ne1_01mp; uint ne1_01L;
+    uint ne1_0mp; uint ne1_0L;
 } p;
 
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@@ -17,22 +24,30 @@ uint get_idx() {
     return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 }
 
+// see init_fastdiv_values in ggml-vulkan.cpp
+uint fastdiv(uint n, uint mp, uint L) {
+    uint msbs, lsbs;
+    // msbs = mulhi(n, mp)
+    umulExtended(n, mp, msbs, lsbs);
+    return (msbs + n) >> L;
+}
+
 uint src0_idx(uint idx) {
-    const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
+    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
     const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
-    const uint i02 = (idx - i03_offset) / (p.ne01*p.ne00);
+    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
     const uint i02_offset = i02*p.ne01*p.ne00;
-    const uint i01 = (idx - i03_offset - i02_offset) / p.ne00;
+    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
     const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
     return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
 }
 
 uint dst_idx(uint idx) {
-    const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
+    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
     const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
-    const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10);
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
     const uint i12_offset = i12*p.ne11*p.ne10;
-    const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
     const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
     return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
 }
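umulExtended is core GLSL (4.00 and later, so available in Vulkan's #version 450 shaders): it returns the full 64-bit product of two 32-bit uints as separate high and low words, so msbs above is exactly the mulhi that the host-side comment describes. To see the decomposition end to end, here is a CPU emulation of the new src0_idx logic (again not part of the patch; the 4x3x2x2 shape is an arbitrary example, and a 64-bit multiply stands in for umulExtended) checked against the old division-based indexing:

// CPU emulation of the new shader indexing (not part of the patch).
#include <cassert>
#include <cstdint>
#include <cstdio>

static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) {
    L = 0;
    while (L < 32 && (uint32_t{1} << L) < d) {
        L++;
    }
    mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
}

static uint32_t fastdiv(uint32_t n, uint32_t mp, uint32_t L) {
    const uint32_t msbs = (uint32_t)(((uint64_t)n * mp) >> 32);
    return (msbs + n) >> L;
}

int main() {
    const uint32_t ne00 = 4, ne01 = 3, ne02 = 2, ne03 = 2;
    uint32_t mp012, L012, mp01, L01, mp0, L0;
    init_fastdiv_values(ne02*ne01*ne00, mp012, L012);
    init_fastdiv_values(ne01*ne00, mp01, L01);
    init_fastdiv_values(ne00, mp0, L0);

    for (uint32_t idx = 0; idx < ne03*ne02*ne01*ne00; idx++) {
        // New path: three fastdivs, mirroring src0_idx.
        const uint32_t i03 = fastdiv(idx, mp012, L012);
        const uint32_t i03_offset = i03 * ne02*ne01*ne00;
        const uint32_t i02 = fastdiv(idx - i03_offset, mp01, L01);
        const uint32_t i02_offset = i02 * ne01*ne00;
        const uint32_t i01 = fastdiv(idx - i03_offset - i02_offset, mp0, L0);
        // Old path: three integer divisions.
        assert(i03 == idx / (ne02*ne01*ne00));
        assert(i02 == (idx - i03_offset) / (ne01*ne00));
        assert(i01 == (idx - i03_offset - i02_offset) / ne00);
    }
    printf("fastdiv index decomposition matches the division-based version\n");
    return 0;
}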