@@ -1087,9 +1087,9 @@ enum e_model {
     MODEL_70B,
 };
 
-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
     bool vocab_only;
@@ -1488,7 +1488,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
         }
     }
 #endif
@@ -2543,8 +2543,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    if (ml.n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
         LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
@@ -2582,7 +2582,7 @@ static void llm_load_tensors(
 
     ml.calc_sizes(ctx_size, mmapped_size);
 
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -3231,7 +3231,7 @@ static void llm_load_tensors(
         ctx_size +
         mmapped_size - vram_weights; // weights in VRAM not in memory
 
-    LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+    LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3250,7 +3250,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS
 
     LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-    LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+    LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
     (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -7962,7 +7962,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             workers.clear();
         }
 
-        LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         int64_t tot_count = 0;
         for (size_t i = 0; i < hist_cur.size(); i++) {
             hist_all[i] += hist_cur[i];
@@ -8502,7 +8502,7 @@ struct llama_context * llama_new_context_with_model(
 
         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
 
         // resized during inference
@@ -8547,7 +8547,7 @@ struct llama_context * llama_new_context_with_model(
         // measure memory requirements for the graph
         size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-        LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
         // recreate allocator with exact memory requirements
         ggml_allocr_free(ctx->alloc);
@@ -8561,7 +8561,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
         ggml_cuda_set_scratch_size(alloc_size);
-        LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
         // calculate total VRAM usage
         auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8581,10 +8581,10 @@ struct llama_context * llama_new_context_with_model(
         size_t ctx_vram_size = alloc_size + kv_vram_size;
         size_t total_vram_size = model_vram_size + ctx_vram_size;
 
-        LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+        LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
                 total_vram_size / 1024.0 / 1024.0,
                 model_vram_size / 1024.0 / 1024.0,
-                ctx_vram_size / 1024.0 / 1024.0);
+                ctx_vram_size / 1024.0 / 1024.0);
 #endif
     }
 
@@ -8605,7 +8605,7 @@ struct llama_context * llama_new_context_with_model(
 
         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
         if (!(result)) { \
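
For context: the rename standardizes the logs on binary (IEC) units, where 1 kiB = 1024 bytes, 1 MiB = 1024 kiB, and 1 GiB = 1024 MiB, matching the "/ 1024.0 / 1024.0" conversions the log lines already perform. The minimal standalone C++ sketch below illustrates that convention; it is not part of the patch, and the print_size helper and sample byte counts are hypothetical, only mirroring the llm_load_print_meta pattern of switching from MiB to GiB at the GiB threshold.

// Standalone sketch, not from the patch: binary-unit constants plus the
// MiB/GiB formatting pattern used in the log messages above.
#include <cstdio>
#include <cstddef>

static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;

// Hypothetical helper: prints a byte count in MiB below the GiB threshold,
// otherwise in GiB, like the model-size branch in llm_load_print_meta().
static void print_size(const char * label, size_t n_bytes) {
    if (n_bytes < GiB) {
        printf("%s = %.2f MiB\n", label, n_bytes / 1024.0 / 1024.0);
    } else {
        printf("%s = %.2f GiB\n", label, n_bytes / 1024.0 / 1024.0 / 1024.0);
    }
}

int main() {
    print_size("kv self size", 256 * MiB);   // prints 256.00 MiB
    print_size("model size", 3826793472u);   // ~3.56 GiB
    return 0;
}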