Python | 《机器学习》 (Watermelon Book): Algorithm Implementations in Python


Contents

  • Evaluation methods
    • Hold-out
    • Cross-validation
    • Bootstrapping
  • Performance measures
  • Linear regression
    • Univariate least squares regression
    • Multivariate least squares regression
  • Logistic regression (logit regression)
    • Without the optimization-algorithm code
    • Complete code (no other files required)
  • LDA (Linear Discriminant Analysis)
    • Function that only computes the projection direction
    • The LDA class
  • Decision trees
    • ID3
  • Neural networks
    • Perceptron
    • Multi-layer feedforward network (standard BP)
  • Support vector machines
    • Resolving some questions about the book
    • The SMO algorithm
      • Deriving the closed-form solution
      • Designing the iteration loop
    • Hard-margin SVM
    • Kernel functions: the idea
    • Soft margin: the idea
  • Clustering
    • k-Means
All of the code below is my own original work, updated as my studies progress and as the mood strikes; discussion is welcome.
The basic optimization algorithms involved are covered in my companion article 《最优化方法》 算法代码 python实现.
Evaluation methods. The code here does not call sklearn.model_selection directly and only supports pandas DataFrame objects.
All code has been tested by me under Python 3.7; adapt it to your own needs.
Hold-out
```python
import pandas as pd
import random
# from numpy import random

def train_test_split(X, Y=None, test_size=0.2):
    length = len(X)
    train_index = list(range(length))
    test_index = random.sample(train_index, int(length*test_size))
    for x in test_index:
        train_index.remove(x)
    if Y is not None:
        return X.iloc[train_index], X.iloc[test_index], Y.iloc[train_index], Y.iloc[test_index]
    else:
        return X.iloc[train_index], X.iloc[test_index]
```
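A minimal usage sketch; the tiny DataFrame is made-up illustration data, just to show the calling convention:

```python
import pandas as pd

# hypothetical toy data (not from the book)
X = pd.DataFrame({'density': [0.697, 0.634, 0.608, 0.556, 0.403],
                  'sugar':   [0.460, 0.264, 0.318, 0.215, 0.237]})
Y = pd.Series([1, 1, 1, 0, 0])

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print(len(X_train), len(X_test))  # 4 1
```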

Cross-validation
```python
import pandas as pd
import random
# from numpy import random

def cross_validation(X, Y=None, k=10):
    '''
    Returns a list of k folds (two lists if Y is given)
    '''
    folds_X = []
    if Y is not None:
        folds_Y = []
    split_index = []
    length = len(X)
    tmp_index = list(range(length))
    for i in range(k, 0, -1):
        split_index.append(random.sample(tmp_index, int(length/i)))
        for x in split_index[-1]:
            tmp_index.remove(x)
        length -= len(split_index[-1])  # shrink by the fold size so the folds stay even
    if Y is not None:
        for index in split_index:
            folds_X.append(X.iloc[index])
            folds_Y.append(Y.iloc[index])
        return folds_X, folds_Y
    else:
        for index in split_index:
            folds_X.append(X.iloc[index])
        return folds_X
```

Bootstrapping
```python
import pandas as pd
import random
# from numpy import random

def bootstrapping(X, Y=None):
    length = len(X)
    train_index = []
    test_index = list(range(length))
    for i in range(length):
        tmp = random.randint(0, length-1)
        train_index.append(tmp)
        if tmp in test_index:
            test_index.remove(tmp)
    if not len(test_index):
        print('Warning: Bad luck is a horror!')
    if Y is not None:
        return X.iloc[train_index], X.iloc[test_index], Y.iloc[train_index], Y.iloc[test_index]
    else:
        return X.iloc[train_index], X.iloc[test_index]
```
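As the book notes, the probability that a given sample never appears in the bootstrap sample converges to $1/e$, so roughly a third of the data ends up in the test set:

$$\lim_{m\to\infty}\left(1-\frac{1}{m}\right)^m=\frac{1}{e}\approx 0.368$$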

Performance measures. If you want to wrap the following functions into a class,
change the return statements inside them to assignments to class attributes yourself.
```python
def mean_squared_error(y, predict_y):
    ''' Regression: mean squared error '''
    m = len(y)
    ans = 0
    for i in range(m):
        ans += (y[i]-predict_y[i])**2
    ans /= m
    return ans

def er(y, predict_y):
    ''' Classification: error rate '''
    m = len(y)
    ans = 0
    for i in range(m):
        if y[i] != predict_y[i]:
            ans += 1
    ans /= m
    return ans

def acc(y, predict_y):
    ''' Classification: accuracy '''
    m = len(y)
    ans = 0
    for i in range(m):
        if y[i] == predict_y[i]:
            ans += 1
    ans /= m
    return ans

def precision(y, predict_y, positive=1, negative=0):
    '''
    Binary classification, precision:
    the fraction of predicted positives that are truly positive
    '''
    a = 0
    b = 0
    for i in range(len(y)):
        if predict_y[i] == positive:
            b += 1
            if y[i] == positive:
                a += 1
    if b:
        return a/b
    else:
        return 0

def recall(y, predict_y, positive=1, negative=0):
    '''
    Binary classification, recall:
    the fraction of actual positives that are predicted positive;
    numerically equal to the true positive rate on the ROC curve
    '''
    a = 0
    b = 0
    for i in range(len(y)):
        if y[i] == positive:
            b += 1
            if predict_y[i] == positive:
                a += 1
    if b:
        return a/b
    else:
        return 0

def F_beta(y, predict_y, positive=1, negative=0, beta=1):
    ''' Binary classification, F_beta (default: F1) '''
    a = 0
    b1 = 0
    b2 = 0
    for i in range(len(y)):
        t1 = int(predict_y[i] == positive)
        t2 = int(y[i] == positive)
        b1 += t1
        b2 += t2
        a += t1 & t2
    if b1 == 0 or b2 == 0 or a == 0:
        return 0
    p = a/b1
    r = a/b2
    return (1+beta**2)*p*r/(beta**2*p+r)
```

Linear regression: univariate least squares regression
```python
import numpy as np

def least_square_method(X, Y):
    '''
    Univariate least squares regression
    X, Y are 1-D arrays/lists
    Returns floats w, b
    '''
    m = len(X)
    X = np.array(X)
    Y = np.array(Y)
    w = np.sum(Y*(X-np.average(X)))/(np.sum(X**2)-np.sum(X)**2/m)
    b = np.sum(Y-w*X)/m
    return w, b

if __name__ == '__main__':
    w = 3
    b = 1
    x = [np.random.randint(1, 10) for i in range(5)]
    y = [w*x[i]+b+np.random.randn()/10 for i in range(5)]
    w_p, b_p = least_square_method(x, y)
    print('w_predict = %.2f' % w_p)
    print('b_predict = %.2f' % b_p)
```

Multivariate least squares regression
```python
import numpy as np
from numpy import random

def multivariate_linear_regression(X, Y):
    '''
    Multivariate least squares regression
    Requires more samples than dimensions
    Y can be a 1-D or 2-D array
    Returns array [w1, w2, ..., wd, b]
    '''
    X = np.mat(X)
    Y = np.mat(Y)
    m = len(X)
    if len(Y) == 1:
        Y = Y.T
    X = np.c_[X, np.ones(m)]
    w = np.linalg.inv(X.T*X)*X.T*Y
    w = np.hstack(np.array(w))
    return w

if __name__ == '__main__':
    w = [random.randint(1, 10) for i in range(5)]
    b = random.randint(-5, 5)
    print('real w&b')
    print(*w, b, sep='\t')
    X = []
    Y = []
    for i in range(20):
        X.append([])
        Y.append(b+random.random()*4-2)
        for j in range(5):
            X[-1].append(random.random()*20-10)
            Y[-1] += w[j]*X[-1][-1]
    wp = multivariate_linear_regression(X, Y)
    wp = map(lambda x: round(x, 2), wp)
    print('predict w&b')
    print(*wp, sep='\t')
```
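The function implements the book's closed-form solution (3.11), with the bias folded into the weight vector through an appended column of ones:

$$\hat{w}^*=(X^TX)^{-1}X^Ty$$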

Logistic regression (logit regression). On the surface, logistic regression is just linear regression with a special link function (the sigmoid). In practice, though, the training tags can only be 0 and 1, and under the sigmoid those values correspond to x-coordinates at infinity, so we cannot fit the parameters by least squares.
But besides least squares the world has another great regression technique, maximum likelihood. If we treat the sigmoid output as a probability (mathematically the sigmoid can in fact be derived from a certain probabilistic model, though I forget which), we can fit the parameters by finding the maximizer of the likelihood function.
The tricky part is that we have to write a multivariate search algorithm ourselves (see the optimization-algorithm notes mentioned at the top of this article).
Once we have a multivariate search algorithm, the implementation is simple.
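Concretely, the objective minimized by `fun()` in the code below is the book's negative log-likelihood (3.27), with $\hat{x}=(x;1)$ and $\beta=(w;b)$:

$$\ell(\beta)=\sum_{i=1}^m\left(-y_i\beta^T\hat{x}_i+\ln\left(1+e^{\beta^T\hat{x}_i}\right)\right)$$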
Without the optimization-algorithm code
```python
import numpy as np
import pandas as pd
from numpy import log, exp
from 多维直接搜索算法 import univarite_search_technique  # module from my optimization-notes article

def logistic_regression(X, Y):
    '''
    Logistic regression (logit regression)
    Requires more samples than dimensions
    Y can be a 1-D or 2-D array (1 = positive, 0 = negative)
    Returns array [w1, w2, ..., wd, b]
    '''
    X = np.mat(X)
    Y = np.mat(Y)
    m = len(X)
    if len(Y) == 1:
        Y = Y.T
    X = np.c_[X, np.ones(m)]
    def fun(*w):
        nonlocal X, Y
        w = np.matrix(w)
        return np.sum(-np.multiply(Y.T, w*X.T)+log(1+exp(w*X.T)))
    beta = univarite_search_technique(fun, X.shape[1])
    return beta

if __name__ == '__main__':
    from numpy import random
    w = [random.randint(1, 10) for i in range(5)]
    b = random.randint(-5, 5)
    print('real w&b')
    print(*w, b, sep='\t')
    X = []
    Y = []
    for i in range(20):
        X.append([])
        Y.append(b+random.random()*4-2)
        for j in range(5):
            X[-1].append(random.random()*20-10)
            Y[-1] += w[j]*X[-1][-1]
        if Y[-1] > 0:
            Y[-1] = 1
        else:
            Y[-1] = 0
    wp = logistic_regression(X, Y)
    wp = list(map(lambda x: round(x, 2), wp))
    print('predict w&b')
    print(*wp, sep='\t')
```

Complete code (no other files required)
```python
import numpy as np
import pandas as pd
from numpy import log, exp

def success_failure_method(f, x0=0, h=5, maxiters=1000):
    '''
    Success-failure method for bracketing a minimum
    f must be a strictly convex function with a minimum
    The returned interval may have length 0, meaning x0 is already optimal
    '''
    y0 = f(x0)
    y1 = f(x0+h)
    while maxiters:
        if y0 > y1:
            y2 = f(x0+3*h)
            if y1 > y2:
                x0 = x0 + h
                y0 = y1
                y1 = y2
                h = 2*h
                maxiters -= 1
                continue
            else:
                if h > 0:
                    return x0, x0+3*h
                else:
                    return x0+3*h, x0
        elif y0 == y1:
            if h > 0:
                return x0, x0+h
            else:
                return x0+h, x0
        else:
            h = -h/4
            y1 = f(x0+h)
            maxiters -= 1
            continue
    print('success-failure method received a non-convex function')
    return 0, 0

def golden_section_method(f, a, b, acy=0.001, maxiters=1000):
    '''
    Golden-section search for a minimum
    Takes a function f and a search interval [a, b]
    '''
    alpha = 0.381966
    beta = 1 - alpha
    x1 = a + alpha*(b-a)
    x2 = a + b - x1
    y1 = f(x1)
    y2 = f(x2)
    while maxiters:
        if y1 > y2:
            a = x1
            if b - a < acy:
                break
            else:
                x1 = x2
                y1 = y2
                x2 = alpha*a + beta*b
                y2 = f(x2)
                maxiters -= 1
                continue
        else:
            b = x2
            if b - a < acy:
                break
            else:
                x2 = x1
                y2 = y1
                x1 = beta*a + alpha*b
                y1 = f(x1)
                maxiters -= 1
                continue
    else:
        print('golden-section search did not converge to the requested accuracy')
    return (a+b)/2

def one_dimensional_search(f, x0=0, method='golden_section', acy=0.001, maxiters=1000, h=5):
    '''
    One-dimensional search
    Available methods:
        golden_section
    '''
    a, b = success_failure_method(f, x0=x0, h=h, maxiters=maxiters)
    if method == 'golden_section':
        ans = golden_section_method(f, a, b, acy=acy, maxiters=maxiters)
    else:
        print('Input method does not exist!')
        return None
    return ans

def univarite_search_technique(f, d, ini_point=None, acy=0.01, maxiters=1000):
    '''
    Coordinate descent
    Takes a function, its dimension, and an initial point
    '''
    if ini_point is None:
        point = [0 for i in range(d)]
    else:
        point = list(ini_point)
    difference = 1000
    old = f(*point)
    while abs(difference) > acy and maxiters:
        for i in range(d):
            part1 = point[:i]
            part2 = point[i+1:]
            fun = lambda x: f(*part1, x, *part2)
            t = one_dimensional_search(fun, point[i])
            point[i] = t
        new = f(*point)
        difference = new - old
        old = new
        maxiters -= 1
    return point

def logistic_regression(X, Y):
    '''
    Logistic regression (logit regression)
    Requires more samples than dimensions
    Y can be a 1-D or 2-D array (1 = positive, 0 = negative)
    Returns array [w1, w2, ..., wd, b]
    '''
    X = np.mat(X)
    Y = np.mat(Y)
    m = len(X)
    if len(Y) == 1:
        Y = Y.T
    X = np.c_[X, np.ones(m)]
    def fun(*w):
        nonlocal X, Y
        w = np.matrix(w)
        return np.sum(-np.multiply(Y.T, w*X.T)+log(1+exp(w*X.T)))
    beta = univarite_search_technique(fun, X.shape[1])
    return beta

if __name__ == '__main__':
    from numpy import random
    w = [random.randint(1, 10) for i in range(5)]
    b = random.randint(-5, 5)
    print('real w&b')
    print(*w, b, sep='\t')
    X = []
    Y = []
    for i in range(20):
        X.append([])
        Y.append(b+random.random()*4-2)
        for j in range(5):
            X[-1].append(random.random()*20-10)
            Y[-1] += w[j]*X[-1][-1]
        if Y[-1] > 0:
            Y[-1] = 1
        else:
            Y[-1] = 0
    wp = logistic_regression(X, Y)
    wp = list(map(lambda x: round(x, 2), wp))
    print('predict w&b')
    print(*wp, sep='\t')
```

For the test results, compare the fitted parameters against the logic that generated the data: if they are proportional to the true parameters, that already demonstrates the algorithm is correct.
LDA (Linear Discriminant Analysis). Since this part requires not only estimating the parameter (the projection direction) but also making predictions, I first wrote a function that computes the projection direction, and then wrote an LDA class.
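Both versions compute the book's closed-form projection direction (3.39), where $S_w$ is the within-class scatter matrix accumulated in the code as `w_scatter`:

$$w = S_w^{-1}(\mu_0-\mu_1)$$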
Function that only computes the projection direction
```python
import numpy as np

def linear_discriminant_analysis(X, Y):
    '''
    LDA linear discriminant analysis
    Requires more samples than dimensions
    Y can be a 1-D or 2-D array (1 = positive, 0 = negative)
    Returns array [w1, w2, ..., wd]
    '''
    n0 = []
    n1 = []
    X = np.mat(X)
    Y = np.mat(Y)
    m = len(X)
    if len(Y) == 1:
        Y = Y.T
    for i in range(m):
        if not int(Y[i]):
            n0.append(i)
        else:
            n1.append(i)
    X0 = X[n0]
    X1 = X[n1]
    miu0 = np.mean(X0, 0)
    miu1 = np.mean(X1, 0)
    w_scatter = 0
    for i in range(len(X0)):
        w_scatter += (X0[i]-miu0).T*(X0[i]-miu0)
    for i in range(len(X1)):
        w_scatter += (X1[i]-miu1).T*(X1[i]-miu1)
    w = np.linalg.inv(w_scatter)*(miu0-miu1).T
    w = np.hstack(np.array(w))
    return w

if __name__ == '__main__':
    from numpy import random
    w = [random.randint(1, 10) for i in range(5)]
    b = random.randint(-5, 5)
    print('real w&b')
    print(*w, b, sep='\t')
    X = []
    Y = []
    for i in range(20):
        X.append([])
        Y.append(b+random.random()*4-2)
        for j in range(5):
            X[-1].append(random.random()*20-10)
            Y[-1] += w[j]*X[-1][-1]
        if Y[-1] > 0:
            Y[-1] = 1
        else:
            Y[-1] = 0
    wp = linear_discriminant_analysis(X, Y)
    wp = list(map(lambda x: round(x, 2), wp))
    print('predict w')
    print(*wp, sep='\t')
```

The LDA class
```python
import numpy as np

class Linear_Discriminant_Analysis:
    def __init__(self):
        pass

    def train(self, X, Y):
        n0 = []
        n1 = []
        X = np.mat(X)
        Y = np.mat(Y)
        m = len(X)
        if len(Y) == 1:
            Y = Y.T
        for i in range(m):
            if not int(Y[i]):
                n0.append(i)
            else:
                n1.append(i)
        X0 = X[n0]
        X1 = X[n1]
        miu0 = np.mean(X0, 0)
        miu1 = np.mean(X1, 0)
        w_scatter = 0
        for i in range(len(X0)):
            w_scatter += (X0[i]-miu0).T*(X0[i]-miu0)
        for i in range(len(X1)):
            w_scatter += (X1[i]-miu1).T*(X1[i]-miu1)
        self.w = np.linalg.inv(w_scatter)*(miu0-miu1).T
        self.d0 = self.w.T*miu0.T  # projection of the class-0 mean
        self.d1 = self.w.T*miu1.T  # projection of the class-1 mean

    def predict(self, x):
        x = np.mat(x)
        if len(x) == 1:
            x = x.T
        d = self.w.T*x
        if abs(d-self.d0) < abs(d-self.d1):
            return 0
        else:
            return 1

if __name__ == '__main__':
    from numpy import random
    w = [random.randint(1, 10) for i in range(5)]
    b = random.randint(-5, 5)
    X = []
    Y = []
    for i in range(20):
        X.append([])
        Y.append(b+random.random()*4-2)
        for j in range(5):
            X[-1].append(random.random()*20-10)
            Y[-1] += w[j]*X[-1][-1]
        if Y[-1] > 0:
            Y[-1] = 1
        else:
            Y[-1] = 0
    model = Linear_Discriminant_Analysis()
    model.train(X, Y)
    print('real:')
    print(*Y, sep=', ')
    print('predict:')
    for x in X:
        y = model.predict(x)
        print(y, end=', ')
    print()
```

Decision trees: ID3, using Python 3.7 (a fatal bug was fixed at 22:22 on 2022/1/4)
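For orientation, the quantities computed by the `Ent()` and `Gain()` functions below are the book's information entropy (4.1) and information gain (4.2):

$$\mathrm{Ent}(D)=-\sum_{k=1}^{|\mathcal{Y}|}p_k\log_2 p_k,\qquad \mathrm{Gain}(D,a)=\mathrm{Ent}(D)-\sum_{v=1}^{V}\frac{|D^v|}{|D|}\mathrm{Ent}(D^v)$$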
```python
# -*- coding: utf-8 -*-
from math import log
import pandas as pd

class DecisionNode:
    '''
    If not a leaf node
        attribute: the attribute tested at this node
        values: all possible values of the attribute [value1, value2, ...]
    If a leaf node
        attribute: the label name
        values: the label value for this branch
    '''
    def __init__(self, attribute=None, values=None, isleaf=False):
        self.a = attribute
        self.values = values
        self.isleaf = isleaf

class DecisionTree:
    def __init__(self, current=None):
        self.current = current
        self.nextTrees = {}

    def addnextTree(self, value, nextTree):
        self.nextTrees[value] = nextTree

class ID3:
    def __init__(self):
        ''' Create an ID3 learner '''

    def train(self, D):
        ''' D is a DataFrame '''
        self.decision_tree = TreeGenerate(D)

    def predict(self, D, inputtype='dict'):
        ''' D is a DataFrame or a dict '''
        if inputtype == 'dict':
            testDict = D.copy()
            return findvalue(testDict, self.decision_tree)

    def show(self):
        printTree(self.decision_tree)

def TreeGenerate(D):
    tag = D.columns[-1]
    if isAllSame(D[tag]):
        # all label values identical
        node = DecisionNode(tag, D.loc[0][tag], isleaf=True)
        return DecisionTree(node)
    elif len(D.columns) == 1:
        # attribute set empty, normal termination
        value = getCountMax(D[tag])
        node = DecisionNode(tag, value, isleaf=True)
        return DecisionTree(node)
    else:
        default = True
        for a in D.columns[:-1]:
            if not isAllSame(D[a]):  # test the attribute column, not the label column
                default = False
                break
        if default:
            # all attribute values identical
            value = getCountMax(D[tag])
            node = DecisionNode(tag, value, isleaf=True)
            return DecisionTree(node)
        else:
            a = choose(D)
            values = []
            subsets = division(D, a)
            for key in subsets:
                values.append(key)
            node = DecisionNode(a, values.copy())
            tree = DecisionTree(node)
            for key in subsets:
                tree.addnextTree(key, TreeGenerate(subsets[key]))
            return tree

def findvalue(target, tree):
    ''' target is a dict '''
    if tree.current.isleaf:
        return tree.current.values
    elif not target:
        print('cannot decide')
        return 0
    else:
        if tree.current.a in target:
            value = target[tree.current.a]
        else:
            print('the test instance is missing an attribute')
            return 0
        del target[tree.current.a]
        nexttree = tree.nextTrees[value]
        return findvalue(target, nexttree)

def printTree(tree, startstr=''):
    s1 = str(tree.current.a)
    if tree.current.isleaf:
        print(startstr+s1+':'+str(tree.current.values))
    else:
        for value in tree.nextTrees:
            print(startstr + s1 + ':' + str(value))
            printTree(tree.nextTrees[value], startstr+'|\t')
        print(startstr)

def Ent(D):
    '''
    Takes a DataFrame whose last column is the label column
    Returns its information entropy
    '''
    ent = 0
    tmp = {}
    s = 0
    tag = D.columns[-1]
    for d in D[tag]:
        if d in tmp:
            tmp[d] += 1
        else:
            tmp[d] = 1
        s += 1
    for key in tmp:
        ent -= tmp[key] / s * log(tmp[key] / s, 2)
    return ent

def division(D, a):
    '''
    Takes a DataFrame and the attribute to split on
    Returns the resulting sub-DataFrames
    '''
    order = {}  # sub-DataFrame for each value of a
    columns_new = list(D.columns)
    columns_new.remove(a)
    for i in range(len(D)):
        if D.loc[i][a] in order:
            order[D.loc[i][a]].append(i)
        else:
            order[D.loc[i][a]] = [i]
    for key in order:
        order[key] = D.loc[order[key], columns_new]
        # resetting the DataFrame index here is essential
        order[key].reset_index(inplace=True, drop=True)
    return order

def Gain(D, a):
    '''
    Takes a DataFrame whose last column is the label column
    Returns the information gain of attribute a
    '''
    gain = Ent(D)
    subsets = division(D, a)
    for key in subsets:
        subset = subsets[key]
        gain -= len(subset) / len(D) * Ent(subset)
    return gain

def choose(D):
    ''' Returns the attribute with the highest information gain '''
    maxgain = -1
    for a in D.columns[:-1]:
        gain = Gain(D, a)
        if gain > maxgain:
            maxgain = gain
            ans = a
    return ans

def isAllSame(L):
    tmp = []
    for x in L:
        if x not in tmp:
            tmp.append(x)
    if len(tmp) == 1:
        return True
    else:
        return False

def getCountMax(L):
    tmp = {}
    count = 0
    for x in L:
        if x in tmp:
            tmp[x] += 1
        else:
            tmp[x] = 1
    for x in tmp:
        if tmp[x] > count:
            count = tmp[x]
            ans = x
    return ans

def readexcel(filepath):
    '''
    The xlsx file must have a header row (attribute names) and no index column
    '''
    df = pd.read_excel(filepath)
    return df

def readcsv(filepath, sep=','):
    '''
    The csv file must have a header row (attribute names) and no index column
    df.columns[:-1] gives the attribute (excluding label) column list
    '''
    df = pd.read_csv(filepath, sep=sep)
    return df

if __name__ == '__main__':
    df = readcsv('wm2.0.csv')
    model = ID3()
    model.train(df)
    # attribute names and values follow the wm2.0.csv file shown below
    test = {'色泽': '青绿', '根蒂': '稍蜷', '敲声': '浊响',
            '纹理': '稍糊', '脐部': '凹陷', '触感': '硬滑'}
    model.show()
    ans = model.predict(test)
    print('------- prediction below -------')
    print(ans)
```

The wm2.0.csv file is as follows
```
色泽,根蒂,敲声,纹理,脐部,触感,好瓜
青绿,蜷缩,浊响,清晰,凹陷,硬滑,是
乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,是
乌黑,蜷缩,浊响,清晰,凹陷,硬滑,是
青绿,蜷缩,沉闷,清晰,凹陷,硬滑,是
浅白,蜷缩,浊响,清晰,凹陷,硬滑,是
青绿,稍蜷,浊响,清晰,稍凹,软粘,是
乌黑,稍蜷,浊响,稍糊,稍凹,软粘,是
乌黑,稍蜷,浊响,清晰,稍凹,硬滑,是
乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,否
青绿,硬挺,清脆,清晰,平坦,软粘,否
浅白,硬挺,清脆,模糊,平坦,硬滑,否
浅白,蜷缩,浊响,模糊,平坦,软粘,否
青绿,稍蜷,浊响,稍糊,凹陷,硬滑,否
浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,否
乌黑,稍蜷,浊响,清晰,稍凹,软粘,否
浅白,蜷缩,浊响,模糊,平坦,硬滑,否
青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,否
```

Neural networks

```python
import numpy as np
```

Perceptron. The basic unit of a neural network is the perceptron, which consists of two layers of neurons: an input layer and an output layer.
```python
class Perceptron:
    def __init__(self, x_dimension, learning_rate=0.1):
        self.weight = np.ones(x_dimension+1)  # last weight acts as the threshold
        self.eta = learning_rate

    def set_learning_rate(self, learning_rate=0.1):
        self.eta = learning_rate

    def update(self, x, y):
        x = np.append(x, [-1])  # dummy input of -1 for the threshold
        y_ = sigmoid(np.sum(x*self.weight))
        self.weight += self.eta*(y-y_)*x

    def output(self, x):
        x = np.append(x, [-1])
        y_ = sigmoid(np.sum(x*self.weight))
        return y_

def sigmoid(x):
    return 1/(1+np.exp(-x))
```
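A minimal usage sketch (my own toy example, not from the book): logical AND is linearly separable, so a single perceptron can learn it; the learning rate and epoch count are arbitrary illustrative choices.

```python
# toy example: training the perceptron on logical AND
p = Perceptron(2, learning_rate=0.5)
data_x = [np.array(v) for v in ([0, 0], [0, 1], [1, 0], [1, 1])]
data_y = [0, 0, 0, 1]
for epoch in range(2000):
    for x, y in zip(data_x, data_y):
        p.update(x, y)
for x in data_x:
    print(x, round(p.output(x), 2))  # outputs should approach 0, 0, 0, 1
```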

Multi-layer feedforward network (standard BP). Testing shows that a multi-layer network can indeed realize XOR, but success depends on the initial values of the weight and threshold parameters when the network is created; otherwise training gets trapped in a local minimum, and other remedies must be considered.
The code below needs to be run several times before training succeeds, so in practice, just as the book says, it should be combined with heuristic algorithms (annealing, genetic algorithms), or random perturbations should be added to the weight-update step.
I also have a conjecture that increasing the number of hidden layers might raise the probability of training successfully, but I have not proved or tested it.
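For reference, the two gradient terms computed in `update_by_step` below are the book's (5.10) and (5.15), where $\hat{y}_j$ is an output-layer output, $b_h$ a hidden-layer output, and $w_{hj}$ a hidden-to-output weight:

$$g_j=\hat{y}_j(1-\hat{y}_j)(y_j-\hat{y}_j),\qquad e_h=b_h(1-b_h)\sum_{j=1}^{l}w_{hj}g_j$$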
```python
class Perceptron:
    def __init__(self, x_dimension, learning_rate=0.1, israndom=True):
        if israndom:
            self.weight = np.random.random(x_dimension)*4-2
            self.weight = np.append(self.weight, np.random.random(1)*2)
        else:
            self.weight = np.ones(x_dimension+1)
        self.eta = learning_rate

    def set_learning_rate(self, learning_rate=0.1):
        self.eta = learning_rate

    def set_weight(self, w):
        self.weight = w

    def get_weight(self):
        return self.weight

    def output(self, x):
        x = np.append(x, [-1])
        y_ = sigmoid(np.sum(x*self.weight))
        return y_

class FNN:
    ''' Multi-layer feedforward network trained with standard BP '''
    def __init__(self, input_size, hidden_size, output_size, eta=0.1):
        self.hidden_layer = [Perceptron(input_size) for i in range(hidden_size)]
        self.output_layer = [Perceptron(hidden_size) for i in range(output_size)]
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.eta = eta

    def predict(self, x):
        ''' x type: np.array '''
        b = []
        for p in self.hidden_layer:
            b.append(p.output(x))
        b = np.array(b)
        y = []
        for p in self.output_layer:
            y.append(p.output(b))
        y = np.array(y)
        return y

    def update_by_step(self, x, y):
        ''' Update from a single x-y pair '''
        b = []
        for p in self.hidden_layer:
            b.append(p.output(x))
        b = np.array(b)
        y_ = []
        for p in self.output_layer:
            y_.append(p.output(b))
        y_ = np.array(y_)  # predicted y
        g = y_*(1-y_)*(y-y_)
        w_output = []  # weights of each output-layer M-P neuron
        alter_w_output = []
        for j in range(self.output_size):
            w_output.append(self.output_layer[j].get_weight())
            # copy so that w_output keeps the pre-update weights used below
            alter_w_output.append(self.output_layer[j].get_weight().copy())
            alter_w_output[-1] += self.eta*g[j]*np.append(b, [-1])
            self.output_layer[j].set_weight(alter_w_output[j])
        w_output = np.array(w_output)
        # e_h = b_h(1-b_h) * sum_j w_hj*g_j, excluding the threshold column
        e = b*(1-b)*np.sum(w_output[:, :self.hidden_size].T*g, 1)
        alter_w_hidden = []
        for i in range(self.hidden_size):
            alter_w_hidden.append(self.hidden_layer[i].get_weight())
            alter_w_hidden[-1] += self.eta*e[i]*np.append(x, [-1])
            self.hidden_layer[i].set_weight(alter_w_hidden[i])

def sigmoid(x):
    return 1/(1+np.exp(-x))

if __name__ == '__main__':
    network = FNN(2, 2, 1)
    data_x = [[1, 1], [1, 0], [0, 1], [0, 0]]
    data_y = [1, 0, 0, 1]
    for i in range(1000):
        for j in range(4):
            network.update_by_step(data_x[j], data_y[j])
    for i in range(4):
        print(network.predict(data_x[i]))
```

Support vector machines: resolving some questions about the book
  1. In the book's formula (6.3), the distance from the support vectors to the hyperplane is assumed to be 1. This is to simplify the later derivation; the margin note on the left of that page mentions that a rescaling step is omitted here.
  2. The book's formula (6.11) is derived by first obtaining
    $$\max_\alpha\ \sum_{i=1}^m{\alpha_i}-\frac{1}{2}w^Tw$$
    and then substituting formula (6.9).
  3. After solving for $\alpha$ with the SMO algorithm, substitute it back into formula (6.9), reproduced below, to obtain $w$.
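Formula (6.9) from the book expresses $w$ through the dual variables:

$$w=\sum_{i=1}^{m}\alpha_iy_ix_i$$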
The SMO algorithm: deriving the closed-form solution
First let me derive the closed-form solution mentioned in the book's description of SMO; let $m$ denote the number of samples.
Start by defining
$$Q = \{1, 2, \dots, m\}\setminus\{i, j\},\qquad C = -\sum_{k\in Q}{\alpha_ky_k}$$
Deleting the constant part of the objective in the dual problem given in the book, we get
$$\begin{aligned}
\max\quad &\alpha_i+\alpha_j-(\alpha_iy_ix_i^T+\alpha_jy_jx_j^T)\sum_{k\in Q}{\alpha_ky_kx_k}-\frac{1}{2}(\alpha_iy_ix_i^T+\alpha_jy_jx_j^T)(\alpha_iy_ix_i+\alpha_jy_jx_j) \\
\text{s.t.}\quad &\alpha_iy_i+\alpha_jy_j=C
\end{aligned}$$
Applying the Lagrange method and differentiating with respect to $\alpha_i$ and $\alpha_j$ gives
$$1-y_ix_i^T\sum_{k\in Q}{\alpha_ky_kx_k}-y_ix_i^T\cdot \alpha_jy_jx_j-\alpha_iy_i^2x_i^Tx_i+\mu y_i=0$$

$$1-y_jx_j^T\sum_{k\in Q}{\alpha_ky_kx_k}-y_jx_j^T\cdot \alpha_iy_ix_i-\alpha_jy_j^2x_j^Tx_j+\mu y_j=0$$
Eliminating $\mu$ gives
$$(y_j-y_i)-(x_i^T-x_j^T)y_iy_j\sum_{k\in Q}{\alpha_ky_kx_k}+(\alpha_iy_i-\alpha_jy_j)y_iy_jx_i^Tx_j-\alpha_iy_i\cdot y_iy_j\cdot x_i^Tx_i+\alpha_jy_j\cdot y_iy_j\cdot x_j^Tx_j=0$$
Finally, substituting $\alpha_iy_i+\alpha_jy_j=C$ yields $\alpha_i$ and $\alpha_j$:
$$\alpha_i=\frac{-(y_i-y_j)-(x_i^T-x_j^T)y_iy_j\sum_{k\in Q}{\alpha_ky_kx_k}-(x_i^T-x_j^T)\cdot Cy_iy_j\cdot x_j}{y_i\cdot y_iy_j(x_i^T-x_j^T)(x_i-x_j)}$$

$$\alpha_j=\frac{-(y_j-y_i)-(x_j^T-x_i^T)y_iy_j\sum_{k\in Q}{\alpha_ky_kx_k}-(x_j^T-x_i^T)\cdot Cy_iy_j\cdot x_i}{y_j\cdot y_iy_j(x_i^T-x_j^T)(x_i-x_j)}$$
Designing the iteration loop
Damn it. When computing the closed-form solution above, I dropped the minus sign at the front of the numerators, which cost me over ten hours of "improving" my algorithm. The algorithm had already been tuned to near perfection and still errored out; I then spent several more hours proving mathematically that the book's algorithm does reach the optimum, and finally several more hours tracing the wrong results backwards step by step until I found this problem in my formula. Let me first describe the improvements I made to the algorithm.
  1. Designed an initialization step for the parameters that is relatively cheap and effective.
  2. Abandoned the book's method for updating the bias b, adopting a new method (see the formula after this list) that still computes a reasonable bias even when the support vectors cannot be determined.
  3. For choosing SMO's two variables, designed a fairly efficient method based on the book's idea.
  4. The algorithm does not support kernels or soft margins yet, but I think both are easy extensions; I'll do them later (doge).
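As a sketch of item 2: the way I read `initialize_b` below, it projects every sample onto $w$, sorts the four boundary values, and centres the hyperplane between the two middle ones; when $w$ already orients the classes correctly this amounts to

$$b = -\frac{1}{2}\left(\min_{y_i=+1} w^Tx_i + \max_{y_i=-1} w^Tx_i\right)$$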
Hard-margin SVM. I haven't written a predict method for now; we already have $w$ and $b$ anyway (too lazy).
```python
import numpy as np
import matplotlib.pyplot as plt

class SVM:
    def __init__(self, x_dimension):
        self.w = np.zeros(x_dimension)
        self.b = 0

    def train(self, X, Y, acy=0.01, maxiters=1000):
        '''
        X type: list of np.array
        Y type: list of values (1 or -1)
        '''
        # step 1: initialization
        m = len(Y)
        for i in range(m):
            self.w += X[i]*Y[i]
        self.w = self.w/m
        # I decided to spend a bit more time initializing b
        self.initialize_b(X, Y)
        # test!
        print('initial w, b:', end=' ')
        print(self.w, self.b)
        self.alphas = [1/m for i in range(m)]
        self.supports = []
        # step 2: SMO
        while maxiters:
            oldw = self.w
            oldb = self.b
            self.sequential_minimal_optimization(X, Y)
            alter = np.sum(np.abs(self.w-oldw))+abs(self.b-oldb)
            if alter < acy:
                break
            maxiters -= 1
        else:
            print('Train Failed')
        if np.sum(self.alphas) == 0:
            print('Train Failed')
        else:
            print('Train Success')

    def initialize_b(self, X, Y):
        m = len(Y)
        set1 = []
        set2 = []
        for i in range(m):
            if Y[i] == 1:
                set1.append(np.sum(self.w*X[i]))
            else:
                set2.append(np.sum(self.w*X[i]))
        margin = [min(set1), max(set1), min(set2), max(set2)]
        margin.sort()
        self.b = -(margin[1]+margin[2])/2

    def sequential_minimal_optimization(self, X, Y):
        zero = 0.000001
        m = len(Y)
        # step 1: decide the order in which the first variable is updated
        outcirculation = []
        other1 = []
        other2 = []
        other3 = []
        fun = [Y[k]*(np.sum(self.w*X[k])-self.b)-1 for k in range(m)]
        for i in range(m):
            if self.alphas[i] > zero:
                if fun[i] < -zero:
                    outcirculation.append(i)
                elif fun[i] > zero:
                    other1.append(i)
                else:
                    other3.append(i)
            else:
                if fun[i] < -zero:
                    other2.append(i)
                else:
                    other3.append(i)
        outcirculation.sort(key=lambda x: fun[x])
        other1.sort(key=lambda x: fun[x], reverse=True)
        other2.sort(key=lambda x: fun[x])
        outcirculation += other1
        outcirculation += other2
        # outcirculation += other3
        del other1, other2, other3, fun
        # steps 2-4: loop
        incirculation = list(range(m))
        for first in outcirculation:
            # step 2: pick the second variable to pair with the first
            fun = [np.sum(self.w*X[k]) for k in range(m)]
            maxvalue = 0
            second = first
            for i in incirculation:
                value = abs(fun[i]-fun[first])
                if Y[first]*Y[i] < 0 and value > maxvalue:
                    maxvalue = value
                    second = i
            # test!
            # print('the two chosen update targets:', first, second)
            # step 3: compute the updated alphas
            if first != second:
                new1, new2 = self.update_alpha(X, Y, first, second)
                self.alphas[first] = new1
                self.alphas[second] = new2
            # step 4: update w
            neww = 0
            newsupport = list(range(m))
            for i in range(m):
                neww = neww + self.alphas[i]*Y[i]*X[i]
                if self.alphas[i] == 0:
                    newsupport.remove(i)
            self.w = neww
            # step 5: update b
            self.initialize_b(X, Y)
        # test!
        print('updated w, b:', end=' ')
        print(self.w, self.b)
        # print(self.alphas)

    def update_alpha(self, X, Y, i, j):
        ''' i, j from 0, 1, ..., m-1 '''
        Q = list(range(len(Y)))
        Q.remove(i)
        Q.remove(j)
        q1 = Y[i]-Y[j]
        c1 = (X[i]-X[j])*Y[i]*Y[j]
        c2 = 0
        c3 = 0
        for k in Q:
            c2 = c2 + self.alphas[k]*Y[k]*X[k]
            c3 = c3 - self.alphas[k]*Y[k]
        q2 = np.sum(c1*c2)
        q3 = c1*c3
        p1 = np.sum(c1*(X[i]-X[j]))
        alpha1 = max((-q1-q2-np.sum(q3*X[j]))/Y[i]/p1, 0)
        alpha2 = max((-q2-q1+np.sum(q3*X[i]))/Y[j]/p1, 0)
        return alpha1, alpha2

if __name__ == '__main__':
    data_x = []
    data_y = []
    w1 = 0.6
    w2 = 0.4
    for i in range(50):
        x1 = np.random.randint(-20, 40)
        x2 = np.random.randint(0, 60)
        f = w1*x1+w2*x2
        if f > 20:
            data_x.append(np.array([x1, x2]))
            data_y.append(1)
        elif f < 16:
            data_x.append(np.array([x1, x2]))
            data_y.append(-1)
    model = SVM(2)
    model.train(data_x, data_y, maxiters=100)
    fig = plt.figure()
    for i in range(len(data_x)):
        if data_y[i] == 1:
            plt.scatter(*data_x[i], c='r')
        else:
            plt.scatter(*data_x[i], c='g')
    xx = np.linspace(-20, 40, 1000)
    yy = (-model.b-model.w[0]*xx)/model.w[1]
    plt.plot(xx, yy)
    plt.show()
```

Kernel functions: the idea. For now I'll only write down my plan for implementing kernels; I'll finish the code when I have time.
The most basic idea of kernels is easy to grasp: at the end of the derivation, the inner product of two vectors, $x_i^Tx_j$, is generalized to $\kappa(x_i,x_j)$; the original $x_i^Tx_j$ is what's called the linear kernel. When deriving SMO's closed-form solution it is easy to see that the $x_i$ are just the given sample data, i.e. constants, so introducing a kernel has little impact on the algorithm itself. Note, though, that the kernel is needed not only where SMO solves for the two variables but also when updating the parameters $w$ and $b$; with a nonlinear kernel, $w$ cannot be maintained explicitly at all, and the model has to be evaluated through the kernel expansion $f(x)=\sum_i\alpha_iy_i\kappa(x_i,x)+b$.
My code implements the linear kernel by multiplying two np.array objects and summing with np.sum; we can replace those np.sum() calls with a custom kappa() function and pass a parameter that selects the kernel.
```python
import numpy as np

def kappa(x1, x2, method='line'):
    ''' input type: np.array '''
    if method == 'line':
        return np.sum(x1*x2)
    elif method == 'gauss':
        # my sketch of a second branch: Gaussian (RBF) kernel,
        # with an arbitrary illustrative bandwidth sigma = 1
        return np.exp(-np.sum((x1-x2)**2)/2)
    ...
```

Soft margin: the idea. Soft margin is probably a bit more troublesome than kernels. Following the book's derivation (which I have not worked through yet), the end result adds a constraint $\alpha_i\le C\ (i=1,2,\dots,m)$ on top of the original dual problem, and the KKT conditions change as well. That leaves quite a few places to modify: the step that chooses the two variables, extra checks when solving for the updated variables, and above all my algorithm for updating $b$ without needing the support vectors, which has to be redesigned now that the definition of a support vector changes.
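For reference, the constraints of the soft-margin dual in the book become box constraints:

$$\sum_{i=1}^m\alpha_iy_i=0,\qquad 0\le\alpha_i\le C,\quad i=1,2,\dots,m$$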
Clustering: k-Means
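k-Means minimizes the within-cluster sum of squared errors; the `update` function below alternates between assigning each sample to the nearest mean and recomputing the means for this objective:

$$E=\sum_{i=1}^{k}\sum_{x\in C_i}\lVert x-\mu_i\rVert_2^2$$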
```python
# -*- coding: utf-8 -*-
from math import sqrt
import random
import pandas as pd
import matplotlib.pyplot as plt

class kMeans:
    def __init__(self, k=3):
        self.k = k
        self.clusts = []  # mean vector of each cluster
        self.result = []  # current clustering result
        self.count = 0

    def train(self, D, iter_max=100):
        self.clusts = initialize(D, self.k)
        self.clusts, result = update(self.clusts, D, self.k)
        count = 1
        for count in range(2, iter_max+1):
            self.clusts, result_n = update(self.clusts, D, self.k)
            if result_n == result:
                break
            result = result_n
        self.result = result
        self.count = count
        self.data = D

    def predict(self, D):
        pass

    def show(self, is2d=False):
        if is2d:
            colorlist = ['b', 'r', 'g', 'y', 'pink', 'black', 'darkmagenta']
            fig = plt.figure()
            for i in range(self.k):
                plt.scatter(self.data.loc[self.result[i]][self.data.columns[0]],
                            self.data.loc[self.result[i]][self.data.columns[1]],
                            c=colorlist[i])
            plt.show()
        else:
            for clust in self.result:
                print(clust)
            print('iterations: %d' % (self.count))

def dist(x, y, dtype='euclidean'):
    '''
    Distance between two vectors
    x and y must have the same length
    '''
    ans = 0
    if dtype == 'euclidean':
        for i in range(len(x)):
            ans += (y[i]-x[i])**2
        ans = sqrt(ans)
    elif dtype == 'manhattan':
        for i in range(len(x)):
            ans += abs(y[i]-x[i])
    return ans

def initialize(remain, k):
    '''
    Initialize the clusters and return them
    remain is the remaining samples, a DataFrame
    '''
    selects = random.sample(range(len(remain)), k)
    clusts = [[remain.loc[j][i] for i in remain.columns] for j in selects]
    '''
    remain = remain.drop(selects)
    remain.reset_index(inplace=True, drop=True)
    '''
    return clusts

def update(clusts, df, k):
    '''
    Takes the current mean vector of each cluster and the training set
    Returns the updated mean vectors and the clustering as a 2-D list
    '''
    result = [[] for i in range(k)]
    new = []
    dimension = len(df.columns)
    for i in range(len(df)):
        n = choose(list(df.loc[i]), clusts, k)
        result[n].append(i)
    for i in range(k):
        new.append([0]*dimension)
        for j in range(len(result[i])):
            for x in range(dimension):
                new[i][x] += df.loc[result[i][j]][df.columns[x]]
        for x in range(dimension):
            new[i][x] /= len(result[i])  # divide by the cluster size
    return new, result

def choose(single, clusts, k):
    ''' Returns the index in clusts of the chosen cluster '''
    mind = dist(single, clusts[0])
    n = 0
    for i in range(1, k):
        d = dist(single, clusts[i])
        if d < mind:
            mind = d
            n = i
    return n

if __name__ == '__main__':
    df = pd.read_csv('wm4.0.csv')
    model = kMeans(2)
    model.train(df)
    model.show()
    model.show(is2d=True)
```

The wm4.0.csv file is as follows
```
密度,含糖率
0.697,0.460
0.403,0.237
0.245,0.057
0.593,0.042
0.748,0.232
0.751,0.489
0.774,0.376
0.481,0.149
0.343,0.099
0.719,0.103
0.714,0.346
0.532,0.472
0.634,0.264
0.437,0.211
0.639,0.161
0.359,0.188
0.483,0.312
0.473,0.376
0.608,0.318
0.666,0.091
0.657,0.198
0.339,0.241
0.478,0.437
0.725,0.445
0.556,0.215
0.243,0.267
0.360,0.370
0.282,0.257
0.525,0.369
0.446,0.459
```
