import math
def calShannonEnt(dataSet):
""" 计算信息熵 """
labelCountDict = {}
for d in dataSet:
label = d[-1]
if label not in labelCountDict.keys():
labelCountDict[label] = 1
else:
labelCountDict[label] += 1
entropy = 0.0
for l, c in labelCountDict.items():
p = 1.0 * c / len(dataSet)
entropy -= p * math.log(p, 2)
return entropy
def filterSubDataSet(dataSet, colIndex, value):
"""返回colIndex特征列label等于value,并且过滤掉改特征列的数据集"""
subDataSetList = []
for r in dataSet:
if r[colIndex] == value:
newR = r[:colIndex]
newR = np.append(newR, (r[colIndex + 1:]))
subDataSetList.append(newR)
return np.array(subDataSetList)
def chooseFeature(dataSet):
""" 通过计算信息增益选择最合适的特征"""
featureNum = dataSet.shape[1] - 1
entropy = calShannonEnt(dataSet)
bestInfoGain = 0.0
bestFeatureIndex = -1
for i in range(featureNum):
uniqueValues = np.unique(dataSet[:, i])
condition_entropy = 0.0
for v in uniqueValues:#计算条件熵
subDataSet = filterSubDataSet(dataSet, i, v)
p = 1.0 * len(subDataSet) / len(dataSet)
condition_entropy += p * calShannonEnt(subDataSet)
infoGain = entropy - condition_entropy#计算信息增益
if infoGain = bestInfoGain:#选择最大信息增益
bestInfoGain = infoGain
bestFeatureIndex = i
return bestFeatureIndex
def creatDecisionTree(dataSet, featNames):
""" 通过训练集生成决策树 """
featureName = featNames[:]# 拷贝featNames,此处不能直接用赋值操作 , 否则新变量会指向旧变量的地址
classList = list(dataSet[:, -1])
if len(set(classList)) == 1:# 只有一个类别
return classList[0]
if dataSet.shape[1] == 1:#当所有特征属性都利用完仍然无法判断样本属于哪一类,此时归为该数据集中数量最多的那一类
return max(set(classList), key=classList.count)
bestFeatureIndex = chooseFeature(dataSet)#选择特征
bestFeatureName = featNames[bestFeatureIndex]
del featureName[bestFeatureIndex]#移除已选特征列
decisionTree = {bestFeatureName: {}}
featureValueUnique = sorted(set(dataSet[:, bestFeatureIndex]))#已选特征列所包含的类别, 通过递归生成决策树
for v in featureValueUnique:
copyFeatureName = featureName[:]
subDataSet = filterSubDataSet(dataSet, bestFeatureIndex, v)
decisionTree[bestFeatureName][v] = creatDecisionTree(subDataSet, copyFeatureName)
return decisionTree
def classify(decisionTree, featnames, featList):
""" 使用训练所得的决策树进行分类 """
classLabel = None
root = decisionTree.keys()[0]
firstGenDict = decisionTree[root]
featIndex = featnames.index(root)
for k in firstGenDict.keys():
if featList[featIndex] == k:
if isinstance(firstGenDict[k], dict):#若子节点仍是树 , 则递归查找
classLabel = classify(firstGenDict[k], featnames, featList)
else:
classLabel = firstGenDict[k]
return classLabel
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
推荐阅读
- 鸿蒙系统审核已通过,鸿蒙系统审核通过了迟迟不推送
- 两个圆交叉圆的css样式,2个圆交叉求交叉部分面积
- pg查询重复数据,pg数据库查询重复数据
- 如何消除word段落阴影,如何消除word文档里的段落符号
- 自学vb.net从何入手 vb怎么自学
- 解字符串压缩c语言,c语言怎么输入字符串
- 视频号主播连线怎么连接,视频号直播怎么上链接卖货
- 嫩芽视频是什么,嫩芽视频是什么意思
- php数据请求方法 phpget请求