4 changes: 3 additions & 1 deletion 4 server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh Original file line numberDiff line numberDiff line change @@ -18,7 +18,7 @@ public: int height; int width; int groups; int groupsize; int gptq_groupsize; int rows_8; int rows_6...
ExLlamaV2DeviceTensors, ) def quant_post_init(model, max_input_length: Optional[int] = None): def quant_post_init(model, max_tokens: Optional[int] = None): """ The max_input_length argument is specific to the exllama backend, The max_tokens argument is specific to the exllama backe...