Spaces:
Running
Running
File size: 4,184 Bytes
511930c 58220b6 ade9bc3 5ef1601 05d6d9c 5d037b9 05d6d9c 58220b6 5ef1601 58220b6 0ec1374 5ef1601 0ec1374 58220b6 0ec1374 58220b6 0ec1374 58220b6 0ec1374 58220b6 0ec1374 58220b6 0ec1374 5ef1601 0ec1374 ade9bc3 5ef1601 fc04dc0 5ef1601 0ec1374 5ef1601 0ec1374 5ef1601 4321600 5ef1601 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
#pragma once
#include "llama.h"
#include <memory>
struct llama_ubatch;
class llama_batch_allocr;
class llama_io_write_i;
class llama_io_read_i;
struct llama_memory_params {
// kv cache
ggml_type type_k;
ggml_type type_v;
// use full-size SWA cache
bool swa_full;
};
enum llama_memory_status {
LLAMA_MEMORY_STATUS_SUCCESS = 0,
LLAMA_MEMORY_STATUS_NO_UPDATE,
LLAMA_MEMORY_STATUS_FAILED_PREPARE,
LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
};
// helper function for combining the status of two memory contexts
// useful for implementing hybrid memory types (e.g. iSWA)
llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);
// helper function for checking if a memory status indicates a failure
bool llama_memory_status_is_fail(llama_memory_status status);
// the interface for managing the memory context during batch processing
// this interface is implemented per memory type. see:
// - llama_kv_cache_unified_context
// - llama_kv_cache_unified_iswa_context
// ...
//
// the only method that should mutate the memory and the memory context is llama_memory_i::apply()
struct llama_memory_context_i {
virtual ~llama_memory_context_i() = default;
// consume the current ubatch from the context and proceed to the next one
// return false if we are done
virtual bool next() = 0;
// apply the memory state for the current ubatch to the memory object
// return false on failure
virtual bool apply() = 0;
// get the current ubatch
virtual const llama_ubatch & get_ubatch() const = 0;
// get the status of the memory context - used for error handling and checking if any updates would be applied
virtual llama_memory_status get_status() const = 0;
};
using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
// general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types
struct llama_memory_i {
virtual ~llama_memory_i() = default;
// split the input batch into a set of ubatches and verify that they can fit into the cache
// return a context object containing the ubatches and memory state required to process them
// check the llama_memory_context_i::get_status() for the result
virtual llama_memory_context_ptr init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) = 0;
// simulate full cache, used for allocating worst-case compute buffers
virtual llama_memory_context_ptr init_full() = 0;
// prepare for any pending memory updates, such as shifts, defrags, etc.
// status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;
// getters
virtual bool get_can_shift() const = 0;
//
// ops
//
// if data == true, the data buffers will also be cleared together with the metadata
virtual void clear(bool data) = 0;
virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
virtual void seq_keep(llama_seq_id seq_id) = 0;
virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0;
virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
//
// state write/read
//
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
};
using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
// TODO: temporary until the llama_kv_cache is removed from the public API
struct llama_kv_cache : public llama_memory_i {
virtual ~llama_kv_cache() = default;
};
|