# for slice_size > 0 the attention score computation # is split across the batch axis to save memory # You can set slice_size with `set_attention_slice` self._slice_size = None self.to_q = nn.Linear(query_dim, inner_dim, bias=False) ...