class LlamaRotaryEmbedding(torch.nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
# Newer revision of the class: a scaling_factor argument is added for the RoPE-scaling variants.
class LlamaRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Build here to make `torch.jit.trace` work.
        self.max_seq_len_cached = max_position_embeddings
        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
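To see concretely what ends up in these buffers, here is a minimal standalone sketch (plain torch, nothing from transformers) mirroring the computation above; dim=128 and 2048 positions are arbitrary example values:

import torch

dim, max_pos, base = 128, 2048, 10000

# one rotation frequency per pair of channels: theta_i = base ** (-2i / dim)
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))    # shape [64]

t = torch.arange(max_pos, dtype=inv_freq.dtype)                        # positions 0 .. 2047
freqs = torch.einsum("i,j->ij", t, inv_freq)                           # [2048, 64], one angle per (position, frequency)
emb = torch.cat((freqs, freqs), dim=-1)                                # [2048, 128], duplicated to cover every channel

cos_cached = emb.cos()[None, None, :, :]                               # [1, 1, 2048, 128]
sin_cached = emb.sin()[None, None, :, :]
print(cos_cached.shape, sin_cached.shape)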
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.outer(t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
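In that generation of the file, forward is what reads this cache and rebuilds it on demand when a sequence longer than max_seq_len_cached arrives. Roughly (a from-memory sketch of the 4.3x-era method, not an exact quote):

    def forward(self, x, seq_len=None):
        # x is only used for its device and dtype; shape [bs, num_attention_heads, seq_len, head_size]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )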
"max_position_embeddings": 8192,比2大 2只有2k说明句子长度变成8k了. "torch_dtype": "bfloat16", 2代用的32位来存储的. 说明16位模型是更好的效率的方案. "transformers_version": "4.38.2", 需要的transformers的版本也更高了. "vocab_size": 128256 #添加了大量的vocab_size 之前只有3w2 ...
                max_position_embeddings=self.max_position_embeddings,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                )
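To make the "linear" branch concrete: LlamaLinearScalingRotaryEmbedding simply divides the position indices by the factor before building the frequency table, so with factor 2.0 position 4096 reuses the angles a vanilla model would assign around position 2048. A small plain-torch sketch (dim and factor are arbitrary example values):

import torch

dim, base, scaling_factor = 128, 10000.0, 2.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))

t = torch.arange(4096, dtype=torch.float32)
t = t / scaling_factor                       # the only change linear scaling introduces
freqs = torch.einsum("i,j->ij", t, inv_freq)

# the last position (index 4095) now gets the angles of "position 2047.5" in the unscaled model
print(torch.allclose(freqs[4095], 2047.5 * inv_freq))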
# Older variant of the same class: the cos/sin cache is built inline in __init__.
class LlamaRotaryEmbedding(torch.nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Build here to make `torch.jit.trace` work.
        self.max_seq_len_cached = max_position_embeddings
        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
'max_position_embeddings': 2048,
'model_type': 'llama',
'num_attention_heads': 16,
'num_hidden_layers': 4,
'num_key_value_heads': 8,
'pretraining_tp': 1,
'rms_norm_eps': 1e-06,
'rope_scaling': None,
'rope_theta': 10000.0,
...
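This is a deliberately tiny config (4 layers, 16 heads), which is handy for stepping through the code path without downloading any weights. A sketch of instantiating it; hidden_size, intermediate_size and vocab_size are assumed values, since they are not visible in the dump above:

import torch
from transformers import LlamaConfig, LlamaForCausalLM

config = LlamaConfig(
    hidden_size=512,              # assumed; not part of the printed config above
    intermediate_size=1376,       # assumed
    vocab_size=32000,             # assumed
    max_position_embeddings=2048,
    num_attention_heads=16,
    num_hidden_layers=4,
    num_key_value_heads=8,
    pretraining_tp=1,
    rms_norm_eps=1e-06,
    rope_scaling=None,
    rope_theta=10000.0,
)

model = LlamaForCausalLM(config)   # randomly initialized, nothing downloaded
out = model(input_ids=torch.randint(0, config.vocab_size, (1, 16)))
print(out.logits.shape)            # [1, 16, vocab_size]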