title_cut = jieba.lcut(line) title_s.append(title_cut) ''' 对title_s(list of list 格式)中的每个list的元素(str)进行过滤 剔除不需要的词语, 即 把停用词表stopwords中有的词语都剔除掉: ''' # 导入停用词表: stopwords = pd.read_excel('H:/stopwords.xlsx') stopwords = stopwords.stopword....
'r',encoding='utf-8').read() 9 text = text.replace('\n','').replace('\u3000','') 10 text_cut = jieba.lcut(text) 11 text_cut = ' '.join(text_cut) 12 13 #过滤一些没有关系
data_loader = DataLoader(args) tok = lambda x: jieba.lcut(x, cut_all=False) train_x, train_y, dev_x, dev_y, test_x = data_loader.get_dateSet(tok) train_gen = DataSetGenerator(train_x, train_y) train_dataset = mds.GeneratorDataset(train_gen, shuffle=True, column_names=['text',...