llama_new_context_with_model: CUDA_Host output buffer size  =   0.49 MiB
llama_new_context_with_model: CUDA0 compute buffer size     = 258.50 MiB
llama_new_context_with_model: CUDA_Host compute buffer size =   9.01 MiB
llama_new_context_with_model: graph nodes = 1030
llama_new_context_with_model...
    raise ValueError("Failed to create llama_context")
ValueError: Failed to create llama_context

This ValueError comes from the Python bindings: it is raised when llama_new_context_with_model returns NULL, which typically means the context could not be allocated (for example, not enough GPU memory for the KV cache at the requested n_ctx).

Expected Behavior
No response

Environment
- OS:
- NVIDIA Driver:
- CUDA:
- docker:
- docker-compose:
- NVIDIA GPU:
- NVIDIA GPU Memory: ...
llama_new_context_with_model: n_ctx      = 131072
llama_new_context_with_model: n_batch    = 2048
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 500000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cac...
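Each of the logged values above echoes a field of llama_context_params. A minimal sketch (not the original code) of how a context producing this log would be configured; field names follow llama.h from the era in which flash_attn was a context flag, so treat them as assumptions:

#include "llama.h"

struct llama_context * make_ctx(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 131072;    // -> "n_ctx = 131072"
    cparams.n_batch         = 2048;      // -> "n_batch = 2048"
    cparams.n_ubatch        = 512;       // -> "n_ubatch = 512"
    cparams.flash_attn      = false;     // -> "flash_attn = 0"
    cparams.rope_freq_base  = 500000.0f; // -> "freq_base = 500000.0"
    cparams.rope_freq_scale = 1.0f;      // -> "freq_scale = 1"
    // returns NULL on failure (e.g. KV cache does not fit in memory)
    return llama_new_context_with_model(model, cparams);
}

Note that n_ctx = 131072 at fp16 implies a very large KV cache; a failed allocation here is a common cause of the "Failed to create llama_context" error shown earlier.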
llama_new_context_with_model: kv self size = 256.00 MB
...
llama_print_timings: sample time      =    3.35 ms /  104 runs   (  0.03 ms per token, 31054.05 tokens per second)
llama_print_timings: prompt eval time = 4593.10 ms /   54 tokens ( 85.06 ms per token,    11.76 tokens per second)
...
llama_context * ctx_eval = llama_new_context_with_model(llama_model_, ctx_eval_params);

Next, a ggml thread pool is created. This step is most likely related to speeding up inference; the code itself does not explain it in detail:

struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
...
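For context, here is a self-contained sketch of that thread-pool path, assuming the ggml threadpool API (ggml_threadpool_params_default / ggml_threadpool_new / llama_attach_threadpool) exposed by builds that support it; the function name attach_pools and its parameters are hypothetical:

#include "llama.h"
#include "ggml.h"

void attach_pools(struct llama_context * ctx, int n_threads, int n_threads_batch) {
    // one pool for single-token generation, one for prompt (batch) processing
    struct ggml_threadpool_params tpp       = ggml_threadpool_params_default(n_threads);
    struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_default(n_threads_batch);

    struct ggml_threadpool * threadpool       = ggml_threadpool_new(&tpp);
    struct ggml_threadpool * threadpool_batch = ggml_threadpool_new(&tpp_batch);

    // make the context run its compute graphs on these pools
    llama_attach_threadpool(ctx, threadpool, threadpool_batch);

    // the pools must outlive the context; release them with
    // ggml_threadpool_free(...) only after llama_free(ctx)
}

Keeping a separate batch pool lets prompt processing use more threads than token-by-token generation, which is typically memory-bound.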
llama_model_load_internal: n_embd    = 4096
llama_model_load_internal: n_mult    = 256
llama_model_load_internal: n_head    = 32
llama_model_load_internal: n_head_kv = 32
llama_model_load_internal: n_layer   = 32
llama_model_load_internal: n_rot     = 128
...
llama_new_context_with_model: n_ctx      = 2048
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_new_context_with_model: kv self size = 1024.00 MB
llama_new_context_with_model: compute buffer total size = 630.14 MB
llama_new_context_with_model...
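The 1024.00 MB figure can be reproduced from the model parameters in the load log above. A small check, assuming an fp16 K/V cache and n_head_kv == n_head (no grouped-query attention), both consistent with the logged values:

#include <cstdio>

int main() {
    const long long n_layer = 32;       // from the model load log
    const long long n_ctx   = 2048;     // from the context log
    const long long n_embd  = 4096;     // from the model load log
    const long long bytes_per_elem = 2; // fp16 cache entries

    // K and V each hold n_ctx * n_embd elements per layer
    const long long kv_bytes = 2 * n_layer * n_ctx * n_embd * bytes_per_elem;
    printf("kv self size = %.2f MB\n", kv_bytes / (1024.0 * 1024.0)); // prints 1024.00 MB
    return 0;
}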
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init: CUDA_Host KV buffer size = 210.00 MiB
llama_kv_cache_init: CUDA0 KV buffer size     = 190.00 MiB
...
LLAMA_API struct llama_context * llama_new_context_with_model(
        struct llama_model * model,
        struct llama_context_params params);

// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);

LLAMA_API int64_t llama_time_us(void);

LLAMA_API size_t llama_...
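A minimal lifecycle sketch tying these declarations together, assuming the classic C API names (llama_backend_init, llama_load_model_from_file, llama_free_model); newer releases have renamed some of these, so treat the exact names as assumptions:

#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s model.gguf\n", argv[0]); return 1; }

    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (!model) { fprintf(stderr, "failed to load model\n"); return 1; }

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 2048;
    // a NULL return here is what surfaces as "Failed to create llama_context"
    // in the Python bindings shown earlier
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (!ctx) { fprintf(stderr, "failed to create context\n"); llama_free_model(model); return 1; }

    const int64_t t0 = llama_time_us();
    // ... evaluate tokens here ...
    fprintf(stderr, "elapsed: %lld us\n", (long long)(llama_time_us() - t0));

    llama_free(ctx);         // frees all memory allocated for the context
    llama_free_model(model);
    llama_backend_free();
    return 0;
}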
        all_hidden_states += (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None
    if not return_dict:
        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=...