File "/tmp/pip-install-t51xid6r/flash-attn_f5a3e9f183ec423884f394ac30739e5f/setup.py", line 164, in raise RuntimeError( RuntimeError: FlashAttention is only supported on CUDA 11.7 and above. Note: make sure nvcc has a supported version by running nvcc -V. torch.__version__ = 2.4....
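Since the error is raised by flash-attn's build-time version check, it helps to confirm both the CUDA build of the installed torch wheel and the toolkit that `nvcc` reports before retrying the install. A minimal check, assuming `nvcc` is on `PATH` (this snippet is illustrative, not part of the original report):

```python
# Print the CUDA version torch was built with and the toolkit nvcc sees;
# flash-attn's setup.py checks these at build time.
import subprocess

import torch

print("torch CUDA build:", torch.version.cuda)  # None means a CPU-only torch wheel
print(subprocess.run(["nvcc", "-V"], capture_output=True, text=True).stdout)
```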
```python
        model_name_or_path,
        load_in_8bit=True,        # xxx: int8 load in
        device_map=device_map,    # xxx: int8 requires passing device_map
        torch_dtype=torch_dtype,
        trust_remote_code=True,
    )
else:
    if model_args.llama:
        model = LlamaForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            torch_...
```
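For context, a self-contained sketch of the 8-bit load path the fragment above uses, assuming `transformers` with `bitsandbytes` installed; the model id and dtype are placeholders, since the original script reads them from `model_args`:

```python
import torch
from transformers import AutoModelForCausalLM

# Hypothetical values; the original pulls these from CLI/model_args.
model = AutoModelForCausalLM.from_pretrained(
    "huggyllama/llama-7b",      # placeholder model id
    load_in_8bit=True,          # quantize weights to int8 via bitsandbytes
    device_map="auto",          # int8 loading requires a device_map
    torch_dtype=torch.float16,  # dtype for the non-quantized modules
    trust_remote_code=True,
)
```

Note that newer transformers releases prefer passing `quantization_config=BitsAndBytesConfig(load_in_8bit=True)` over the bare `load_in_8bit=True` flag, but the flag matches what the snippet above uses.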