bestInfoGain=0;bestFeature=-1# 遍历每个特征,计算信息增益foriinrange(numFeatures):# 取出对应特征值,即一列数据 featList=[example[i]forexampleindataSet]uniqueVals=np.unique(featList)newEntropy=0forvalueinuniqueVals:subDataSet=splitData(dataSet,i,value)prob=len(subDataSet)/float(dataSet)newEntropy+...
featList = [example[i] for example in dataSet] uniqueVals = np.unique(featList) newEntropy = 0 for value in uniqueVals: subDataSet = splitData(dataSet, i, value) prob = len(subDataSet)/float(dataSet) newEntropy += prob * calcShannonEnt(subDataSet) # 计算信息增益G(Y, X) = H(Y)...
baseEntropy=calShannoEnt(dataSet) # 计算整个数据集的初始熵,这是决策前数据的不确定性度量 bestInfoGain=0.0 # 初始化最大信息增益为0 bestFeature=-1 # 初始化最佳特征的索引为-1,表示还未找到 for i in range(numFeatures): #对数据集中的每个特征进行循环 featList=[example[i] for example in dataSet...
bestFeatLabel=labels[bestFeat]#初始化树,用于存储树的结构,是很多字典的嵌套结构myTree ={bestFeatLabel: {}}#已经用过的特征删去del(labels[bestFeatLabel])#取出最优特征这一列的值featVals = [example[bestFeat]forexampleindataSet]#特征的取值个数uniqueVals =np.unique(featVals)#开始递归分裂forvaluein...
numEntries = len(dataSet) labelCounts = {} for featVec in dataSet: #the the number of unique elements and their occurance currentLabel = featVec[-1] if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0 labelCounts[currentLabel] += 1 ...
40 numFeatures = len(dataSet[0])-1 41 baseEntropy = calcShannonEnt(dataSet) # 原始的熵 42 bestInfoGain = 0 43 bestFeature = -1 44 for i in range(numFeatures): 45 featList = [example[i] for example in dataSet] 46 uniqueVals = set(featList) ...
featList=[example[i]forexampleindataSet] uniqueVals=set(featList) newEntropy=0.0 forvalueinuniqueVals: # 计算每种划分方式的信息熵,并对所有唯一特征值得到的熵求和 subDataSet=splitDataSet(dataSet,i,value) prob=len(subDataSet)/float(len(dataSet)) ...
numFeatures = len(dataSet[0])-1 #特征数baseEnt = calcShannonEnt(dataSet) # 总的信息嫡 baseInfoGain = 0.0 # 信息增益 bestFeature = -1 # 最好的特征的Index for i in range(numFeatures): featList = [example[i] for example in dataSet] # 获取该特征的所有特征值 ...
defchoose_best_feature_to_split(dataset): numFeatures =len(dataset[0]) -1# 数据集最后一列作为标签baseEntropy = compute_shannon_etropy(dataset) bestInfoGain =0.0bestFeature = -1foriinrange(numFeatures):# 遍历所有的特征featList = [example[i]forexampleindataset]# create a list of all the ...
foriinrange(10): list_1 = np.array(np.arange(1,10000)) list_1 = np.sin(list_1) print("使用Numpy用时{}s".format(time.time()-start)) 从如下运行结果,可以看到使用Numpy库的速度快于纯 Python 编写的代码: 使用纯Python用时0.0174443721771240...