Input Parameters: None Output: Output Format: Text Output Parameters: None Software Integration: Supported Hardware Platform(s): RTX 4090, Ada GPUs Supported Operating System(s): Windows Inference: TRT-LLM Inference EngineWindows Setup with TRT-LLM Test Hardware: RTX 4090...
num_cpu_blocks: int, watermark: float = 0.01, sliding_window: Optional[int] = None, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks self.num_total_cpu_blocks = num_cpu_blocks self.block_sliding_window = None if sliding_window is not None: asser...
I am trying to run your example (interactive_gen.py) but the pip installation is failing. pip install -r requirements.txt Collecting flash-attn==2.3.4 Using cached flash_attn-2.3.4.tar.gz (2.3 MB) Preparing metadata (setup.py) ... error error: subprocess-exited-with-error × python se...
undefined symbol: _ZN2at4_ops9_pad_enum4callERKNS_6TensorEN3c108ArrayRefINS5_6SymIntEEElNS5_8optionalIdEE Dao-AILab/flash-attention#836 In the end, flash-attn==2.5.7 is the newest version which still works given the torch versions necessary for vllm and without the TransformerEngine libr...
EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" # technically this needs Mistral-7B-v0.1 as base, but we're not testing # generation quality here LORA_NAME = "typeof/zephyr-7b-beta-lora" @@ -121,7 +122,7 @@ def zephyr_lora_files(): return snapshot_download(repo_id=LORA...
num_gpu_blocks: int, num_cpu_blocks: int, watermark: float = 0.01, sliding_window: Optional[int] = None, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks self.num_total_cpu_blocks = num_cpu_blocks self.block_sliding_window = None if sliding_win...
num_cpu_blocks: int, watermark: float = 0.01, sliding_window: Optional[int] = None, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks self.num_total_cpu_blocks = num_cpu_blocks self.block_sliding_window = None if sliding_window is not None: asser...
swap_space: int, sliding_window: Optional[int] = None, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization self.swap_space_bytes = swap_space * _GB self.sliding_window = sliding_window self._verify_args() # Will be set after profiling.41...
max_context_len: int, block_tables: torch.Tensor, sliding_window: Optional[int] = None, ) -> None: self.seq_groups = seq_groups self.seq_data = seq_data @@ -38,6 +39,24 @@ def __init__( self.max_context_len = max_context_len self.block_tables = block_tables self.to_cache...
num_gpu_blocks: int, num_cpu_blocks: int, watermark: float = 0.01, sliding_window: Optional[int] = None, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks self.num_total_cpu_blocks = num_cpu_blocks self.block_sliding_window = None if sliding_win...