python信息熵函数 python 熵值法( 六 )


""" 通过训练集生成决策树 """
featureName = featNames[:]# 拷贝featNames,此处不能直接用赋值操作,否则新变量会指向旧变量的地址
classList = list(dataSet[:, -1])
if len(set(classList)) == 1:# 只有一个类别
return classList[0]
if dataSet.shape[1] == 1:#当所有特征属性都利用完仍然无法判断样本属于哪一类,此时归为该数据集中数量最多的那一类
return max(set(classList), key=classList.count)
bestFeatureIndex = chooseFeature(dataSet)#选择特征
bestFeatureName = featNames[bestFeatureIndex]
del featureName[bestFeatureIndex]#移除已选特征列
decisionTree = {bestFeatureName: {}}
featureValueUnique = sorted(set(dataSet[:, bestFeatureIndex]))#已选特征列所包含的类别 ,  通过递归生成决策树
for v in featureValueUnique:
copyFeatureName = featureName[:]
subDataSet = filterSubDataSet(dataSet, bestFeatureIndex, v)
decisionTree[bestFeatureName][v] = creatDecisionTree(subDataSet, copyFeatureName)
return decisionTree
def classify(decisionTree, featnames, featList):
""" 使用训练所得的决策树进行分类 """
classLabel = None
root = decisionTree.keys()[0]
firstGenDict = decisionTree[root]
featIndex = featnames.index(root)
for k in firstGenDict.keys():
if featList[featIndex] == k:
if isinstance(firstGenDict[k], dict):#若子节点仍是树,则递归查找
classLabel = classify(firstGenDict[k], featnames, featList)
else:
classLabel = firstGenDict[k]
return classLabel
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
下面用鸢尾花数据集对该算法进行测试 。由于ID3算法只能用于标称型数据,因此用在对连续型的数值数据上时,还需要对数据进行离散化,离散化的方法稍后说明,此处为了简化,先使用每一种特征所有连续性数值的中值作为分界点,小于中值的标记为1,大于中值的标记为0 。训练1000次,统计准确率均值 。
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
data = https://www.04ip.com/post/np.c_[iris.data, iris.target]
scoreL = []
for i in range(1000):#对该过程进行10000次
trainData, testData = https://www.04ip.com/post/train_test_split(data)#区分测试集和训练集
featNames = iris.feature_names[:]
for i in range(trainData.shape[1] - 1):#对训练集每个特征,以中值为分界点进行离散化
splitPoint = np.mean(trainData[:, i])
featNames[i] = featNames[i]+'='+'{:.3f}'.format(splitPoint)
trainData[:, i] = [1 if x = splitPoint else 0for x in trainData[:, i]]
testData[:, i] = [1 if x = splitPoint else 0 for x in testData[:, i]]
decisionTree = creatDecisionTree(trainData, featNames)
classifyLable = [classify(decisionTree, featNames, td) for td in testData]
scoreL.append(1.0 * sum(classifyLable == testData[:, -1]) / len(classifyLable))
print 'score: ', np.mean(scoreL)
1
2
3
4
5
6
7
8
9

推荐阅读