vq_encoder(text_features, feature_masks) if indices.ndim == 4 and indices.shape[1] == 1 and indices.shape[3] == 1: indices = indices[:, 0, :, 0] else: logger.error(f"Unknown indices shape: {indices.shape}") return encoded = model.encode(audios, audio_lengths) indices = ...