self.is_dataset_tokenized = is_dataset_tokenized self.dataset = load_dataset(data_path, split="train", streaming=True) self.iterable_dataset = iter(self.dataset) self.buffer = torch.zeros(0, self.cfg.d_in, device=self.cfg.device) @@ -37,16 +40,16 @@ def get_batch_tokens(self): ...