```python
import transformers

config = transformers.AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    trust_remote_code=True,
)
config.use_cache = False
# Turn off the ExLlama kernels for this GPTQ checkpoint; both flag spellings
# are set to cover older and newer transformers releases.
config.quantization_config.use_exllama = False
config.quantization_config.disable_exllama = True

# Load model and tokenizer mod...
```
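The snippet breaks off at the model load. A minimal sketch of how it might continue, assuming the standard transformers Auto classes; `model_args` and `training_args` are the surrounding training script's dataclasses and are assumed here, not defined in the source:

```python
# Sketch of the truncated model/tokenizer load, reusing the config above.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    cache_dir=training_args.cache_dir,
    trust_remote_code=True,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
)
```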
```python
    # ... (first method's signature truncated in the source)
        if not self.use_exllama:
            return 0
        # fp16 dequantization buffer: 2 bytes per weight element, plus padding
        return self.input_size_per_partition * self.output_size * 2 + 128

    def temp_fwd_size(self, max_tokens):
        if not self.use_exllama:
            return 0
        # fp32 forward scratch: 4 bytes per output element per token, plus padding
        return self.output_size * max_tokens * 4 + 128

    def scratch_space_fixed(self, max_tokens):
        if not self.use_exllama:
            ...  # (fragment ends here in the source)
```
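To make the sizing arithmetic concrete, a small worked example. The 4096x4096 layer shape and 2048-token budget are illustrative, not from the source:

```python
# Illustrative buffer-size arithmetic for the methods above, assuming a
# hypothetical 4096x4096 layer with use_exllama enabled.
input_size_per_partition = 4096
output_size = 4096
max_tokens = 2048

temp_dq = input_size_per_partition * output_size * 2 + 128  # fp16 dequant buffer
temp_fwd = output_size * max_tokens * 4 + 128               # fp32 forward scratch

print(f"dequant buffer:  {temp_dq / 2**20:.1f} MiB")   # ~32.0 MiB
print(f"forward scratch: {temp_fwd / 2**20:.1f} MiB")  # ~32.0 MiB
```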
# exllama-runpod-serverless

LLaMA GPTQ models with fast ExLlama inference on RunPod Serverless GPUs.

## Summary

This Docker image runs a Llama model on a serverless RunPod instance using turboderp's optimized exllama repo.

## Set Up

1. Create a RunPod account and navigate to the RunPod Serverless ...
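Once deployed, the endpoint is called through RunPod's standard serverless HTTP API. A minimal sketch, assuming a deployed endpoint ID and an API key from the RunPod console; the `prompt` and `max_new_tokens` input fields are illustrative, since the exact input schema is defined by this image's handler:

```python
import os
import requests

ENDPOINT_ID = "your-endpoint-id"          # hypothetical; shown in the RunPod console
API_KEY = os.environ["RUNPOD_API_KEY"]    # API key from the RunPod console

# POST /v2/<endpoint_id>/runsync submits a job and waits for the result.
# The "input" payload is handed to the worker's handler function.
resp = requests.post(
    f"https://api.runpod.ai/v2/{ENDPOINT_ID}/runsync",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={"input": {"prompt": "Hello, world!", "max_new_tokens": 128}},
    timeout=300,
)
resp.raise_for_status()
print(resp.json())
```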
# ExLlama

A standalone Python/C++/CUDA implementation of Llama for use with 4-bit GPTQ weights, designed to be fast and memory-efficient on modern GPUs.

Disclaimer: The project is coming along, but it's still a work in progress!

## Hardware requirements

I am developing on an RTX 4090 and an...