>>> print(batch) {'input_ids': tensor([[ 101, 8667, 146, 112, 182, 170, 1423, 5650, 102], [ 101, 1262, 1330, 5650, 102, 0, 0, 0, 0], [ 101, 1262, 1103, 1304, 1304, 1314, 1141, 102, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0], ...
token_type_ids主要作用还是区分上下文的吧,单句分类任务不加也无所谓吧
(bert_input) segment_label = np.array(segment_label) bert_label = np.array(bert_label) #is_next_label = np.array(is_next_label) output = {"input_ids": bert_input, "token_type_ids": segment_label, 'attention_mask': attention_mask, "bert_label": bert_label}#, is_next_label ...
padding=True, truncation=True, return_tensors="pt") >>> print(batch) {'input_ids': tensor([[ 101, 8667, 146, 112, 182, 170, 1423, 5650, 102], [ 101, 1262, 1330, 5650, 102, 0, 0, 0, 0], [ 101, 1262, 1103, 1304, 1304, 1314, 1141, 102, 0]]), 'token_type_ids':...