# Fragment of a Vocab.__init__: sort tokens by frequency, then build the
# index <-> token maps, with the unknown token at index 0.
self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
uniq_tokens += [token for token, freq in self.token_freqs
                if freq > min_freq and token not in uniq_tokens]
self.idx_to_token, self.token_to_idx = [], {}
for token in uniq_tokens:
    self.idx_to_token.append(token)
    self.token_to_idx[token] = len(self.idx_to_token) - 1
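The fragment above assumes a `counter` of token frequencies already exists. Below is a minimal sketch of how such a counter is typically produced; the helper name `count_corpus` and the handling of both flat and nested token lists are assumptions modeled on common tokenization code, not shown in the snippet:

import collections

def count_corpus(tokens):
    # Hypothetical helper: flatten a list of token lists into one list,
    # then count how often each token occurs.
    if tokens and isinstance(tokens[0], list):
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)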
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " f"{vocab_size} - {expected_end_id}; got {actual_ids}")items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) self.added_tokens_dict = added_tokens ...
# Variant: build the unique-token list in sorted (alphabetical) order
# rather than frequency order.
self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
# The list of unique tokens
self.idx_to_token = list(sorted(set(
    ['<unk>'] + reserved_tokens +
    [token for token, freq in self.token_freqs if freq >= min_freq])))
self.token_to_idx = {
    token: idx for idx, token in enumerate(self.idx_to_token)}
# Revised constructor fragment: reserved tokens come right after '<unk>',
# and the reverse map is built by enumeration.
self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                          reverse=True)
# The index for the unknown token is 0
self.idx_to_token = ['<unk>'] + reserved_tokens
self.token_to_idx = {
    token: idx for idx, token in enumerate(self.idx_to_token)}
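A standalone demonstration of the two lookup structures these constructors build; the token list here is illustrative, not taken from the source:

reserved_tokens = ['<pad>', '<bos>', '<eos>']
idx_to_token = ['<unk>'] + reserved_tokens + ['the', 'time', 'machine']
token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}

assert token_to_idx['<unk>'] == 0                    # unknown token sits at index 0
assert idx_to_token[token_to_idx['time']] == 'time'  # round trip: token -> id -> token
print(token_to_idx.get('traveller', token_to_idx['<unk>']))  # unseen word falls back to 0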
# Fragment of a word-index class: _add_word registers a new word or
# increments its count; add_words applies _add_word over a sequence.
        self._add_word(token)

    def _add_word(self, word: str):
        if word not in self.word2idx:
            self.word2idx[word] = self.count
            self.word2count[word] = 1
            self.idx2word[self.count] = word
            self.count += 1
        else:
            self.word2count[word] += 1

    def add_words(self, words: Sequence):
        for word in words:
            self._add_word(word)
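To make the fragment runnable, here is a self-contained sketch around it; the class name `WordIndex` and the zero-argument constructor are assumptions, since the snippet shows only the two methods:

from typing import Sequence

class WordIndex:
    # Hypothetical class wrapping the methods from the fragment above.
    def __init__(self):
        self.word2idx, self.word2count, self.idx2word = {}, {}, {}
        self.count = 0

    def _add_word(self, word: str):
        if word not in self.word2idx:
            self.word2idx[word] = self.count
            self.word2count[word] = 1
            self.idx2word[self.count] = word
            self.count += 1
        else:
            self.word2count[word] += 1

    def add_words(self, words: Sequence):
        for word in words:
            self._add_word(word)

vocab = WordIndex()
vocab.add_words(['to', 'be', 'or', 'not', 'to', 'be'])
print(vocab.word2idx['to'], vocab.word2count['to'])  # -> 0 2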