python信息熵函数 python 熵值法( 六 ) _Rou

""" 通过训练集生成决策树 """
featureName = featNames[:]# 拷贝featNames，此处不能直接用赋值操作，否则新变量会指向旧变量的地址
classList = list(dataSet[:, -1])
if len(set(classList)) == 1:# 只有一个类别
return classList[0]
if dataSet.shape[1] == 1:#当所有特征属性都利用完仍然无法判断样本属于哪一类，此时归为该数据集中数量最多的那一类
return max(set(classList), key=classList.count)
bestFeatureIndex = chooseFeature(dataSet)#选择特征
bestFeatureName = featNames[bestFeatureIndex]
del featureName[bestFeatureIndex]#移除已选特征列
decisionTree = {bestFeatureName: {}}
featureValueUnique = sorted(set(dataSet[:, bestFeatureIndex]))#已选特征列所包含的类别，通过递归生成决策树
for v in featureValueUnique:
copyFeatureName = featureName[:]
subDataSet = filterSubDataSet(dataSet, bestFeatureIndex, v)
decisionTree[bestFeatureName][v] = creatDecisionTree(subDataSet, copyFeatureName)
return decisionTree
def classify(decisionTree, featnames, featList):
    """Classify one sample with a trained decision tree.

    Args:
        decisionTree: Nested dict of the form
            ``{feature_name: {feature_value: subtree_or_label}}``. Only the
            first key of each level is used as the node's feature.
        featnames: Sequence of feature names; the position of a name in
            this sequence is the position of that feature in ``featList``.
        featList: Feature values of the sample to classify.

    Returns:
        The label stored at the leaf reached by following the sample's
        feature values, or ``None`` when the sample's value for a node's
        feature has no matching branch.

    Raises:
        ValueError: If a node's feature name is not in ``featnames``
            (raised by ``featnames.index``).
    """
    # ``dict.keys()[0]`` only works on Python 2; taking the first key via
    # the iterator behaves the same on Python 2 and Python 3.
    root = next(iter(decisionTree))
    branches = decisionTree[root]
    sampleValue = featList[featnames.index(root)]
    for branchValue, child in branches.items():
        if sampleValue == branchValue:
            if isinstance(child, dict):
                # The child is itself a tree: keep descending.
                return classify(child, featnames, featList)
            return child
    # No branch matches the sample's value for this feature.
    return None
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
下面用鸢尾花数据集对该算法进行测试。由于ID3算法只能用于标称型数据，因此用于连续型数值数据时，还需要先对数据进行离散化。离散化的方法稍后说明，此处为了简化，先使用每一种特征所有连续型数值的均值作为分界点，小于等于均值的标记为1，大于均值的标记为0。训练1000次，统计准确率均值。
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Evaluate the decision tree on the iris data set: repeat a random
# train/test split 1000 times and report the mean accuracy.
iris = datasets.load_iris()
# Feature columns followed by the class label as the last column.
data = np.c_[iris.data, iris.target]
scoreL = []
for _ in range(1000):
    trainData, testData = train_test_split(data)
    # Fresh copy per run, because the names are rewritten below to carry
    # the split point used for each feature.
    featNames = iris.feature_names[:]
    for col in range(trainData.shape[1] - 1):
        # Discretise each feature into 1/0 around the mean of the training
        # column; the same split point is then applied to the test rows.
        splitPoint = np.mean(trainData[:, col])
        featNames[col] = featNames[col] + '=' + '{:.3f}'.format(splitPoint)
        # NOTE(review): the comparison operator was lost in the original
        # text; `<=` is reconstructed from the accompanying description
        # (values below the split point are labelled 1) -- confirm.
        trainData[:, col] = np.where(trainData[:, col] <= splitPoint, 1, 0)
        testData[:, col] = np.where(testData[:, col] <= splitPoint, 1, 0)
    decisionTree = creatDecisionTree(trainData, featNames)
    classifyLable = [classify(decisionTree, featNames, td) for td in testData]
    # Count matches pairwise so a prediction of None simply counts as a miss.
    hits = sum(
        1 for predicted, actual in zip(classifyLable, testData[:, -1])
        if predicted == actual
    )
    scoreL.append(1.0 * hits / len(classifyLable))
# A single-argument print call produces the same output on Python 2 and 3.
print('score:  {}'.format(np.mean(scoreL)))
1
2
3
4
5
6
7
8
9

python信息熵函数 python 熵值法( 六 )

推荐阅读

松下空调故障代码c7是什么原因,不能单纯的简单对比

临床比较多见的白癜风症状有什么

微信天天领红包在哪里领 2018微信天天领红包怎么领取教程

多肉银手指怎么养胖银手指多肉植物怎么养成粗壮

瘦腿冠军白萝卜 6大蔬果击退赘肉！

正宗蜂蜜一般多少钱一斤正宗蜂蜜多少钱一斤

长安奔奔mini用了什么悬架型号长安奔奔mini用了什么悬架

孩子翻眼睛是什么原因

马文的战争演员表介绍马文的战争演员表

入门安卓开发要多久，零基础自学安卓开发需要多久

戴震难师文言文翻译戴震难师文言文翻译是什么

图片如何批量转换成pdf，图片怎么批量转换成pdf格式

黄芪什么时候喝最好

SAP|SAP UI5 数据绑定中的工厂函数

服务器怎么发布网站服务器怎么发彩色字

2023杭州元宵烟花灯光秀户外电子屏直播地址汇总

批量生成条形码工具,excel自动生成条形码公式

面膜多久敷一次面膜敷多久效果最好

本田幼兽为何不进入中国？

明可达台灯怎么样？推荐几款性价比高明可达台灯