nltk.classify.mallet.config_mallet(mallet_home)# Get the training & test corpus. We simplify the tagset a little:# just the first 2 chars.defstrip(corpus):return[[(w, t[:2])for(w,t)insent]forsentincorpus] brown_train = strip(brown.tagged_sents(categories='news')[:train_size]) brow...
Corpus.train_with_opennlp(self.train_file, self.model_file)defprint_performance(self):gold = [it.strip().split()[-1].split('|')foritinopen(self.test_file)] pred = [it.strip().split()[-1]foritinopen(self.predicted_file)] evaluate(gold, pred)defoutput_json_format(self, parse_pat...
defbrown_diversity():"""calculate and display lexical diversity score (token/token_type) for each brown corpus category"""cfd = nltk.ConditionalFreqDist((category, word)forcategoryinbrown.categories()forwordinbrown.words(categories=category))print"{0:15s} {1:10s}".format("CATEGORY","DIVERSITY")...
def load_movie_corpus_each_sentence(range):
    """Load per-file sentence lists for a slice of NLTK's movie_reviews corpus.

    Args:
        range: String of the form ``"start:end"`` (two runs of digits
            separated by a colon) used to slice ``movie_reviews.fileids()``.
            The name shadows the builtin but is kept so existing callers
            keep working.

    Returns:
        A list with one entry per selected file id, each entry being
        ``movie_reviews.sents(fileid)``. ``None`` when ``range`` does not
        match the ``start:end`` pattern.
    """
    bounds = re.match(r'(\d+):(\d+)$', range)
    if bounds is None:
        # Malformed spec: yield None, the same result the implicit
        # fall-through return produced.
        return None

    first, last = (int(part) for part in bounds.groups())

    # Imported lazily, after validation, so NLTK is only needed when a
    # well-formed range was supplied.
    from nltk.corpus import movie_reviews as corpus

    selected = corpus.fileids()[first:last]
    return [corpus.sents(fileid) for fileid in selected]

# Source attribution (truncated in the scraped listing):
# developer ID: zjusuyong, project name: multi_grai...