```diff
+            gpt = GPT(**cfg, use_flash_attn=use_flash_attn, device=device, logger=self.logger).eval()
             assert gpt_ckpt_path, "gpt_ckpt_path should not be None"
             gpt.load_state_dict(torch.load(gpt_ckpt_path, weights_only=True, mmap=True))
```
...
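The hunk above only forwards the new `use_flash_attn` flag into the GPT constructor; the attention code that actually consumes it sits outside this diff. As a rough sketch of the usual pattern (the `SelfAttention` class and its attribute names below are illustrative, not taken from the PR), an attention block might branch on the flag like this:

```python
import torch
import torch.nn.functional as F

try:
    from flash_attn import flash_attn_func  # optional dependency
except ImportError:
    flash_attn_func = None


class SelfAttention(torch.nn.Module):
    def __init__(self, dim: int, n_heads: int, use_flash_attn: bool = False):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = dim // n_heads
        self.qkv = torch.nn.Linear(dim, 3 * dim)
        self.proj = torch.nn.Linear(dim, dim)
        # Only honour the flag if the optional flash-attn package is importable.
        self.use_flash_attn = use_flash_attn and flash_attn_func is not None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t, _ = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # Reshape to (batch, seqlen, n_heads, head_dim).
        q, k, v = (z.reshape(b, t, self.n_heads, self.head_dim) for z in (q, k, v))
        if self.use_flash_attn:
            # flash_attn_func takes (batch, seqlen, nheads, headdim) tensors in
            # fp16/bf16 on CUDA and fuses the softmax(QK^T)V computation.
            out = flash_attn_func(q, k, v, causal=True)
        else:
            # Fallback: PyTorch SDPA expects (batch, nheads, seqlen, headdim).
            out = F.scaled_dot_product_attention(
                q.transpose(1, 2),
                k.transpose(1, 2),
                v.transpose(1, 2),
                is_causal=True,
            ).transpose(1, 2)
        return self.proj(out.reshape(b, t, -1))
```

Keeping the non-flash branch as the default preserves behaviour on platforms where flash-attn is not installed.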
- Add causal=True (b6a1833)
- Enable when vllm_flash_attn (da50678)
- Merge branch 'main' into flash-attention-decode (6d5b4ec)
- Add vllm-flash-attn as dependency (37cb5a9)

WoosukKwon added 2 commits on May 13, 2024, 17:54:

- yapf (1be2eb3)
- Use fp32 in ref attn softmax ...
```python
        self.attention_pp_true = DistriSelfAttentionPP(
            self.attention, self.distri_config, use_flash_attn=True
        )
        self.attention_pp_false = DistriSelfAttentionPP(
            self.attention, self.distri_config, use_flash_attn=False
        )
        self.hidden_states = torch.rand(
            1,
            self.sequence_length,
            self.hidden_dim,
            dtype=self.dtype,
            device=self.device,
        )

    def test_flash_attn_true_...
```
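The setup builds the same attention layer twice, once with `use_flash_attn=True` and once with `use_flash_attn=False`, over a shared random input, so the truncated test method presumably compares the two paths. A minimal sketch of that kind of check, assuming the wrappers can be called directly on `hidden_states` (that call signature is an assumption, not read from the test file):

```python
import torch

def _compare_flash_vs_reference(self):
    # Run the same input through both wrappers.
    with torch.no_grad():
        out_flash = self.attention_pp_true(self.hidden_states)
        out_ref = self.attention_pp_false(self.hidden_states)
    # Flash attention reorders floating-point reductions, so compare with a
    # small tolerance instead of demanding bit-exact equality.
    torch.testing.assert_close(out_flash, out_ref, rtol=1e-3, atol=1e-3)
```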
```python
        compile: bool = True,
        coef: Optional[str] = None,
        use_flash_attn=False,
    ):
        if device is None:
            device = select_device()
```

```diff
@@ -292,7 +295,7 @@ def _load(
         if gpt_config_path:
             cfg = OmegaConf.load(gpt_config_path)
-            gpt = GPT(**cfg, device=device, logger=self.logger).eval()
```
...
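The `use_flash_attn` keyword defaults to `False`, so existing callers are unaffected; when no device is given, the loader falls back to `select_device()`. A device picker of that kind typically looks like the sketch below; this is an assumption about the helper's behaviour, not the repository's actual implementation:

```python
import torch

def select_device() -> torch.device:
    # Illustrative fallback order: prefer CUDA, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        return torch.device("cuda")
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")
```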