mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-02 09:10:21 +00:00
shadow : cont gcc
ggml-ci
This commit is contained in:
+6
-7
@@ -311,9 +311,9 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_m
|
||||
ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
|
||||
if (ggml_backend_split_buffer_type_fn) {
|
||||
size_t dev_index = [&]() {
|
||||
auto * reg = ggml_backend_dev_backend_reg(dev);
|
||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
|
||||
if (ggml_backend_reg_dev_get(reg, i) == dev) {
|
||||
ggml_backend_reg_t reg_dev = ggml_backend_dev_backend_reg(dev);
|
||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg_dev); ++i) {
|
||||
if (ggml_backend_reg_dev_get(reg_dev, i) == dev) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
@@ -1304,7 +1304,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
|
||||
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
||||
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
|
||||
return {cpu_dev, &pimpl->cpu_buft_list};
|
||||
return { cpu_dev, &pimpl->cpu_buft_list };
|
||||
}
|
||||
const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
|
||||
auto * dev = devices.at(layer_gpu);
|
||||
@@ -1453,7 +1453,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
// avoid using a host buffer when using mmap
|
||||
auto * buft_dev = ggml_backend_buft_get_device(buft);
|
||||
if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
}
|
||||
|
||||
@@ -3697,8 +3696,8 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
|
||||
|
||||
const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
|
||||
auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
|
||||
[name](const std::pair<std::string, struct ggml_tensor *> & it) {
|
||||
return it.first == name;
|
||||
[name](const std::pair<std::string, struct ggml_tensor *> & entry) {
|
||||
return entry.first == name;
|
||||
});
|
||||
if (it == tensors_by_name.end()) {
|
||||
return nullptr;
|
||||
|
||||
+4
-4
@@ -130,17 +130,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
||||
return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
|
||||
};
|
||||
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
||||
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
|
||||
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name_layer) {
|
||||
if (n_expert > 1) {
|
||||
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
|
||||
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
||||
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
||||
// tensor name.
|
||||
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
|
||||
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
|
||||
if (sscanf(name_layer, "blk.%d.", &i_layer) != 1) {
|
||||
throw std::runtime_error(format("Failed to determine layer for tensor %s", name_layer));
|
||||
}
|
||||
if (i_layer < 0 || i_layer >= n_layer) {
|
||||
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
|
||||
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name_layer, n_layer));
|
||||
}
|
||||
}
|
||||
return std::make_pair(i_layer, n_layer);
|
||||
|
||||
+4
-4
@@ -2496,15 +2496,15 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
|
||||
|
||||
// copy piece chars to output text buffer
|
||||
// skip up to 'lstrip' leading spaces before copying
|
||||
auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
|
||||
for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
|
||||
token++;
|
||||
auto _try_copy = [=] (const char * text, size_t size) -> int32_t {
|
||||
for (int32_t i = 0; i < lstrip && size && *text == ' '; ++i) {
|
||||
text++;
|
||||
size--;
|
||||
}
|
||||
if (length < (int32_t)size) {
|
||||
return -(int32_t) size;
|
||||
}
|
||||
memcpy(buf, token, size);
|
||||
memcpy(buf, text, size);
|
||||
return (int32_t) size;
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user