机器学习|机器学习python代码

提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档

文章目录

  • 前言
  • 一、线性回归
    • 1.1 单特征的线性回归
    • 1.2 多特征的线性回归
    • 1.3 正规方程
  • 二、logistic 回归
    • 2.1 plot data
    • 2.2 单特征的logistic回归
    • 2.3 多特征的logistic回归
  • 三、Neural Network
    • 3.1 plot data
    • 3.2 Neural Network
  • 四、交叉验证误差
  • 五、SVM大间距分类器
    • 5.1 线性svm
    • 5.2 非线性SVM
  • 六、K-means
    • 6.1 K-Means应用
    • 6.2 K-Means应用——图片压缩
  • 七、PCA 主成分分析
    • 7.1 PCA
    • 7.2 PCA 应用 ——图片压缩
  • 八、异常检测

前言 此篇博客主要记录使用python来完成机器学习的某些算法,例如线性回归,逻辑回归,k-mean等,本文参考了吴恩达老师的机器学习课程,详细资料可见Coursera网站机器学习课程。
提示:以下是本篇文章正文内容,下面案例可供参考
一、线性回归 1.1 单特征的线性回归
import numpy as np import pandas as pd import matplotlib.pyplot as plt## 计算代价 def computeCost(X,y,theta): m=len(X) J=np.sum(np.power(X*theta-y,2))/(2*m) return J## 梯度下降 def gradientDescent(X,y,theta,alpha,iters): m=len(X) cost=np.zeros(iters) for i in range(iters): temp=((X * theta - y).T * X * alpha / m).T theta=np.subtract(theta,temp) cost[i]=computeCost(X,y,theta) return theta,costpath=r'E:\python\MyMachineLearning\exp1\ex1data1.txt' data=https://www.it610.com/article/pd.read_csv(path,header=None,names=['Population','Profit']) ## 插入x0 data.insert(0,'ones',1) cols=data.shape[1] X=data.iloc[:,:-1] y=data.iloc[:,cols-1:cols] ## 转为矩阵便于后面的矩阵计算 X=np.matrix(X.values) y=np.matrix(y.values) ## 初始化相应的参数 theta=np.matrix(np.array([0,0])).T alpha=0.01 max_iters=1500 theta,cost=gradientDescent(X,y,theta,alpha,max_iters) ## 绘制曲线 x=np.linspace(data.Population.min(),data.Population.max()) y=theta[0,0]+theta[1,0]*x figure,axis=plt.subplots() axis.plot(x,y,'r',label='Prediction') axis.scatter(x=data['Population'],y=data['Profit'],label='Training Data') axis.set_xlabel('Population') axis.set_ylabel('Profit') plt.show()

1.2 多特征的线性回归 非常类似于上部分的代码,只是样本的特征数量增加了。
import pandas as pd import numpy as np import matplotlib.pyplot as pltdef computeCost(X,y,theta): m=len(X) J=np.sum(np.power(X*theta-y,2))/(2*m) return Jdef gradientDescent(X,y,theta,alpha,iters): m=len(X) cost=np.zeros(iters) for i in range(iters): temp=(X*theta-y).T*X*alpha/m theta=np.subtract(theta,temp.T) cost[i]=computeCost(X,y,theta) return theta,costfile=r'E:\python\MyMachineLearning\exp1\ex1data2.txt' data=https://www.it610.com/article/pd.read_csv(file,names=["size",'bedroom','price'],header=None) # 归一化 data=https://www.it610.com/article/(data-data.mean())/data.std() data.insert(0,'ones',1) cols=data.shape[1] X=data.iloc[:,:-1] y=data.iloc[:,cols-1:cols] X=np.matrix(X.values) y=np.matrix(y.values) theta=np.matrix(np.array([0,0,0])).T alpha=0.01 iters=1500 theta,cost=gradientDescent(X,y,theta,alpha,iters) figure,axis=plt.subplots() x=range(1,1501) y=cost plt.plot(x,y,color='red') plt.show()

1.3 正规方程 (X.T*X)-1 X.Ty
import pandas as pd import numpy as np import matplotlib.pyplot as pltdef normalEquation(X,y): ##计算逆矩阵 temp=np.linalg.inv(X.T*X) theta=temp*X.T*y return theta def computeCost(X,y,theta): m=len(X) J=np.sum(np.power(X*theta-y,2))/(2*m) return Jfile=r'E:\python\MyMachineLearning\exp1\ex1data2.txt' data=https://www.it610.com/article/pd.read_csv(file,names=['Size','Bedroom','Price']) data=https://www.it610.com/article/(data-data.mean())/data.std() data.insert(0,'ones',1) cols=data.shape[1] X=data.iloc[:,:-1] y=data.iloc[:,cols-1:cols] X=np.matrix(X.values) y=np.matrix(y.values) theta=normalEquation(X,y) cost=computeCost(X,y,theta) print(cost)

二、logistic 回归 2.1 plot data
import pandas as pd import matplotlib.pyplot as plt file=r'E:\python\MyMachineLearning\exp2\ex2data1.txt' data=https://www.it610.com/article/pd.read_csv(file,names=['exam1','exam2','Accepted']) positive=data[data['Accepted'].isin([1])] negative=data[data['Accepted'].isin([0])] fig,ax=plt.subplots() ax.scatter(positive['exam1'],positive['exam2'],c='b',marker='o',label='Accepted') ax.scatter(negative['exam1'],negative['exam2'],c='r',marker='x',label='unAccepted') ax.set_xlabel('exam1Score') ax.set_ylabel('exam2Score') plt.show()

2.2 单特征的logistic回归 2.3 多特征的logistic回归
import numpy as np import pandas as pd from scipy.optimize import minimize from scipy.io import loadmatdef sigmoid(z): return 1/(1+np.exp(-z))def computeCost(theta,X,y,lambda_i): X=np.matrix(X) y=np.matrix(y) theta=np.matrix(theta) m=len(X) error=(-y.T)*np.log(sigmoid(X*theta.T))-(1-y.T)*np.log(1-sigmoid(X*theta.T)) reg=np.sum(np.power(theta[:,1:theta.shape[1]],2))*lambda_i/(2*m) return error/m+regdef gradient(theta,X,y,lambda_i): X=np.matrix(X) y=np.matrix(y) theta=np.matrix(theta) m=len(X) error=sigmoid(X*theta.T)-ygrad=((X.T*error)/m).T+(lambda_i*theta)/m grad[0,0]=np.sum(error.T*X[:,0])/m return np.array(grad).ravel()# def gradient(theta, X, y, lambda_i): #theta = np.matrix(theta) #X = np.matrix(X) #y = np.matrix(y) # #parameters = int(theta.ravel().shape[1]) #error = sigmoid(X * theta.T) - y # #grad = ((X.T * error) / len(X)).T + ((lambda_i / len(X)) * theta) # ## intercept gradient is not regularized #grad[0, 0] = np.sum(np.multiply(error, X[:, 0])) / len(X) # #return np.array(grad).ravel()def one_vs_all(X,y,num_labels,lambda_i): rows=X.shape[0] params=X.shape[1] all_theta=np.zeros((num_labels,params+1)) X=np.insert(X,0,values=np.ones(rows),axis=1)for i in range(1,num_labels+1): theta=np.zeros(params+1) y_i=np.array([1 if label==i else 0 for label in y]) y_i=np.reshape(y_i,(rows,1))fmin=minimize(fun=computeCost,x0=theta,args=(X,y_i,lambda_i),method='TNC',jac=gradient) all_theta[i-1,:]=fmin.x return all_theta file=r'E:\python\MyMachineLearning\exp3\ex3data1.mat' data=https://www.it610.com/article/loadmat(file) X=data['X'] y=data['y'] all_theta=one_vs_all(data['X'],data['y'],10,1) print(all_theta)

三、Neural Network 3.1 plot data
import numpy as np from scipy.io import loadmat import pandas as pd import matplotlib import matplotlib.pyplot as pltfile=r'E:\python\MyMachineLearning\exp4\ex4data1.mat' data=https://www.it610.com/article/loadmat(file) X=data['X'] y=data['y'] sample_idx=np.random.choice(np.arange(X.shape[0]),100) sample_img=data['X'][sample_idx,:] fig,ax=plt.subplots(nrows=10,ncols=10,sharey=True,sharex=True) for r in range(10): for c in range(10): ax[r,c].matshow(np.array(sample_img[10*r+c].reshape((20,20))).T,cmap=matplotlib.cm.binary) ## 显示灰度图像 ## 自动调整x,y坐标刻度 plt.xticks(np.array([])) plt.yticks(np.array([])) plt.show()

3.2 Neural Network 这里的神经网络的参数为theta 包含了w和b
import numpy as np import pandas as pd import matplotlib.pyplot as plt import matplotlib from scipy.io import loadmat from sklearn.preprocessing import OneHotEncoder from scipy.optimize import minimizefile=r'E:\python\MyMachineLearning\exp4\ex4data1.mat' data=https://www.it610.com/article/loadmat(file) X=data['X'] y=data['y'] weight = loadmat("E:\python\MyMachineLearning\exp4\ex4weights.mat") theta1, theta2 = weight['Theta1'], weight['Theta2']def sigmoid(z): return 1/(1+np.exp(-z))##前向传播 def forward_propagate(X,theta1,theta2): m=X.shape[0] a1=np.insert(X,0,values=np.ones(m),axis=1) z2=a1*theta1.T a2=np.insert(sigmoid(z2),0,np.ones(m),axis=1) z3=a2*theta2.T h=sigmoid(z3) return a1,z2,a2,z3,hdef computeCost(theta1,theta2,input_size,hidden_size,num_labels,X,y,lambda_i): m=X.shape[0] X=np.matrix(X) y=np.matrix(y)## 正向传播 a1,z2,a2,z3,h=forward_propagate(X,theta1,theta2) ## 对应的神经元相乘 error=np.multiply(-y,np.log(h))-np.multiply(1-y,np.log(1-h)) reg=(lambda_i*1.0/(2*m))*(np.sum(np.power(theta1[:,1:],2))+np.sum(np.power(theta2[:,1:],2)))return np.sum(error)/m+regdef sigmoid_gradient(z): return np.multiply(sigmoid(z),(1-sigmoid(z)))## 反向传播 def backprop(params,input_size,hidden_size,num_labels,X,y): m=X.shape[0] X=np.matrix(X) y=np.matrix(y) ## 参数reshape theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1)))) theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):],(num_labels,(hidden_size+1))))a1,z2,a2,z3,h=forward_propagate(X,theta1,theta2)J=0 delta1=np.zeros(theta1.shape) delta2=np.zeros(theta2.shape)error=np.multiply((-y),np.log(h))-np.multiply((1-y),np.log(1-h)) J=np.sum(error)/m ## 反向传播 delta3=h-y delta2=delta3*theta2 delta2=delta2[:,1:] delta2=np.multiply(delta2,sigmoid_gradient(z2))Delta1 = np.zeros(theta1.shape) Delta2 = np.zeros(theta2.shape) Delta2+=delta3.T*a2 Delta1+=delta2.T*a1 Delta2=Delta2/m Delta1=Delta1/mreturn J,Delta1,Delta2def backpropReg(params,input_size,hidden_size,num_labels,X,y,lambda_i): m=X.shape[0] X=np.matrix(X) y=np.matrix(y) theta1=np.matrix(np.reshape(params[:(hidden_size)*(input_size+1)],(hidden_size,(input_size+1)))) theta2=np.matrix(np.reshape(params[(hidden_size)*(input_size+1):],(num_labels,(hidden_size+1)))) a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2) J = 0 delta1 = np.zeros(theta1.shape) delta2 = np.zeros(theta2.shape)error = np.multiply((-y), np.log(h)) - np.multiply((1 - y), np.log(1 - h)) reg = (lambda_i * 1.0 / (2 * m)) * (np.sum(np.power(theta1[:, 1:], 2)) + np.sum(np.power(theta2[:, 1:], 2))) J = np.sum(error) / m+regdelta3=h-y delta2=delta3*theta2 delta2=delta2[:,1:] delta2=np.multiply(delta2,sigmoid_gradient(z2)) Delta1 = np.zeros(theta1.shape) Delta2 = np.zeros(theta2.shape) Delta2 += delta3.T * a2 Delta1 += delta2.T * a1 Delta2 = Delta2 / m Delta1 = Delta1 / m Delta1[:, 1:] = Delta1[:, 1:] + (theta1[:, 1:] * lambda_i) / m Delta2[:, 1:] = Delta2[:, 1:] + (theta2[:, 1:] * lambda_i) / m ## 将俩个矩阵降维一维 grad = np.concatenate((np.ravel(Delta1), np.ravel(Delta2))) return J,grad## 自动转成矩阵形式 encoder=OneHotEncoder(sparse=False) y_onehot=encoder.fit_transform(y) input_size = 400 hidden_size = 25 num_labels = 10 lambda_i=1 ## 随机初始theta 减去0.5保证均值为0 params = (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - 0.5) * 0.24 ## 或者对于theta1 theta2 分别进行初始化 # theta1=np.random.random(size=hidden_size * (input_size + 1))*(2*0.12)-0.12fmin=minimize(fun=backpropReg,x0=(params),args=(input_size,hidden_size,num_labels,X,y_onehot,lambda_i),method='TNC',jac=True,options={'maxiter':250}) print(fmin)

四、交叉验证误差 【机器学习|机器学习python代码】本代码中使用的算法为线性回归
file=r'E:\python\MyMachineLearning\exp5\ex5data1.mat' data=https://www.it610.com/article/sio.loadmat(file) X=data['X'] y=data['y'] X, y, Xval, yval, Xtest, ytest = map(np.ravel,[data['X'], data['y'], data['Xval'], data['yval'], data['Xtest'], data['ytest']]) X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]theta=np.ones(X.shape[1]) lambda_i=1 training_cost, cv_cost = [], [] ## 样本总数 m=X.shape[0] ## 考虑随着样本数的增加,对应的误差(训练误差和交叉验证误差) for i in range(1,m+1): res = linear_regression(X[:i, :], y[:i], 0)tc = computeCostReg(res.x, X[:i, :], y[:i], 0) cv = computeCostReg(res.x, Xval, yval, 0)training_cost.append(tc) cv_cost.append(cv)## 寻找最佳的lambda_i ## 一般按照0.003增长 l_candidate=[0,0.001,0.003,0.01,0.03,0.1,0.3,1.3,10] for l in l_candidate: res = linear_regression(X, y, l)tc = computeCost(res.x, X, y) cv = computeCost(res.x, Xval, yval)training_cost.append(tc) cv_cost.append(cv)## 绘制对应的图像 fig, ax = plt.subplots() ax.plot(l_candidate, training_cost, label='training') ax.plot(l_candidate, cv_cost, label='cross validation') ## 添加对应的标签 plt.legend() plt.xlabel('lambda') plt.ylabel('cost') plt.show()## 在测试集上机选误差,来进行评估 for l in l_candidate: theta = linear_regression(X, y, l).x print('test cost(l={}) = {}'.format(l, computeCost(theta, Xtest, ytest)))

五、SVM大间距分类器 这里用的是sklearn 包中的svm。
5.1 线性svm
import numpy as np import pandas as pd import scipy.io as sio from sklearn.metrics import classification_report from sklearn import svm import matplotlib.pyplot as pltdef decision_boundary(svc,X1min,X1max,X2min,X2max,diff): x1=np.linspace(X1min,X1max,1000) x2=np.linspace(X2min,X2max,1000) cordinates=[(x,y) for x in x1 for y in x2] x_cord,y_cord=zip(*cordinates) c_val=pd.DataFrame({'x1':x_cord,'x2':y_cord}) ## 计算与决策边界的距离 c_val['cval']=svc.decision_function(c_val[['x1','x2']])##取出位于样本分界线的点 decision=c_val[np.abs(c_val['cval'])

5.2 非线性SVM 这里用的是高斯核函数
高斯核函数如下
## 高斯核函数 def gaussian_kernel(x1,x2,sigma): return np.exp(-np.sum((x1-x2)**2)/(2*(sigma**2)))

训练代码如下
file=r'E:\python\MyMachineLearning\exp6\ex6data2.mat' init_data=https://www.it610.com/article/sio.loadmat(file) X=init_data['X'] data=https://www.it610.com/article/pd.DataFrame(X,columns=['X1','X2']) data['y']=init_data['y']## 使用内置的高斯内核进行数据拟合 svc=svm.SVC(C=100,gamma=10,probability=True) svc.fit(data[['X1','X2']],data['y']) res=svc.score(data[['X1','X2']],data['y'])

六、K-means 6.1 K-Means应用
import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sb import scipy.io as sio## 计算每个点最近的中心 def find_closet_centroids(X,centroids): m=X.shape[0] k=centroids.shape[0] idx=np.zeros(m) for i in range(m): min_dist=1000000 for j in range(k): ## 计算点到所有中心的距离 dist=np.sum((X[i,:]-centroids[j,:])**2) if(dist < min_dist): min_dist=dist idx[i]=j return idx#计算聚类中心 def compute_centroids(X,idx,k): m,n=X.shape centroids=np.zeros((k,n)) for i in range(k): ## 获取对应中心的所有样本 indices=np.where(idx==i) centroids[i,:]=(np.sum(X[indices,:],axis=1)/len(indices[0])).ravel() return centroids## K_mean 算法 def run_k_means(X,initial_centroids,max_iters): m,n=X.shape k=initial_centroids.shape[0] idx=np.zeros(m) centroid=initial_centroids for i in range(max_iters): idx=find_closet_centroids(X,centroid) centroid=compute_centroids(X,idx,k) return idx,centroid## 随机初始化k个聚类中心 def init_centroids(X,k): m,n=X.shape centroids=np.zeros((k,n)) ## 随机初始化 0-m 的k个数字 idx=np.random.randint(0,m,k) for i in range(k): centroids[i,:]=X[idx[i],:] return centroidsfile=r'E:\python\MyMachineLearning\exp7\ex7data2.mat' init_data=https://www.it610.com/article/sio.loadmat(file) X=init_data['X'] ## 随机初始化聚类中心 initial_centroids = init_centroids(X,3) idx, centroids = run_k_means(X, initial_centroids, 10) ## 位于不同中心的点 cluster1=X[np.where(idx==0)[0],:] cluster2=X[np.where(idx==1)[0],:] cluster3=X[np.where(idx==2)[0],:] fig,ax=plt.subplots() ax.scatter(cluster1[:,0],cluster1[:,1],c='b',label='Cluster 1') ax.scatter(cluster2[:,0],cluster2[:,1],c='r',label='Cluster 2') ax.scatter(cluster3[:,0],cluster3[:,1],c='g',label='Cluster 3') ax.legend() plt.show()

同样,我们也可以使用sklearn自带的KMeans来完成训练
## n_init 表示随机初始化聚类中心的次数n_job表示并行的cpu数,-1表示使用所有cpu model=KMeans(n_clusters=16,n_init=100,n_jobs=-1) model.fit(data) #获取聚簇中心 centroids = model.cluster_centers_ ## #获取每个数据点的对应聚簇中心的索引 C = model.predict(data)

6.2 K-Means应用——图片压缩
import numpy as np import pandas as pd import scipy.io as sio from IPython.display import Image import matplotlib.pyplot as pltdef computeMinDist(X,centroids): m = X.shape[0] k = centroids.shape[0] idx = np.zeros(m) for i in range(m): min_dist = 1000000 for j in range(k): ## 计算点到所有中心的距离 dist = np.sum((X[i, :] - centroids[j, :]) ** 2) if (dist < min_dist): min_dist = dist idx[i] = j return idxdef compute_centroids(X,idx,k): m, n = X.shape centroids = np.zeros((k, n)) for i in range(k): ## 获取对应中心的所有样本 indices = np.where(idx == i) centroids[i, :] = (np.sum(X[indices, :], axis=1) / len(indices[0])).ravel() return centroidsdef init_centroids(X,k): m,n=X.shape idx=np.random.randint(0,m,k) centroids=np.zeros((k,n)) for i in range(k): centroids[i,:]=X[idx[i],:] return centroids#k_mean 算法 def k_mean(X,initial_centroids,max_iters): m, n = X.shape k = initial_centroids.shape[0] idx = np.zeros(m) centroid = initial_centroids for i in range(max_iters): idx = computeMinDist(X, centroid) centroid = compute_centroids(X, idx, k) return idx, centroidfile=r'E:\python\MyMachineLearning\exp7\bird_small.png' imgFile=r'E:\python\MyMachineLearning\exp7\bird_small.mat' image_data=https://www.it610.com/article/sio.loadmat(imgFile) A=image_data['A']## normalize A=A/255X=np.reshape(A,(A.shape[0]*A.shape[1],A.shape[2]))initial_centroids=init_centroids(X,16) idx, centroids = k_mean(X, initial_centroids, 10)idx = computeMinDist(X, centroids)## 将每个像素映射到聚类中心的值 X_recovered = centroids[idx.astype(int),:] ## 重新调整维度 X_recovered = np.reshape(X_recovered, (A.shape[0], A.shape[1], A.shape[2])) plt.imshow(X_recovered) plt.show()

七、PCA 主成分分析 7.1 PCA
import numpy as np import scipy.io as sio import matplotlib.pyplot as pltdef pca(X): ## standardization m = X.shape[0] X = (X - X.mean(axis=0)) / X.std(axis=0)X=np.matrix(X)## 计算协方差矩阵cov=X.T*X/m ## 奇异值分解 U,S,V=np.linalg.svd(cov) return U,S,V## 选择k个降维后的数据 def project_data(X,U,k): U_reduced=U[:,:k] return X*U_reduced## 恢复降维的数据 def recover_data(Z,U,k): U_reduced = U[:, :k] return Z*U_reduced.T## 选择合适的k值 def choose_K(X,lambda_i): U,S,V=pca(X) res=1000 all_sum=sum(S) for i in range(1,100): temp=sum(S[:i]) if(temp/all_sum>=lambda_i): res=i break return res## pca 可用于数据的降维 file=r'E:\python\MyMachineLearning\exp7\ex7data1.mat' data=https://www.it610.com/article/sio.loadmat(file) X=data['X']## 数据可视化 # fig,ax=plt.subplots() # ax.scatter(X[:,0],X[:,1],c='r') # plt.show()U, S, V = pca(X) Z = project_data(X, U, 1) w=recover_data(Z,U,1)##第一主成分的投影轴基本上是数据集中的对角线。 当我们将数据减少到一个维度时, # 我们失去了该对角线周围的变化,所以在我们的再现中,一切都沿着该对角线。

7.2 PCA 应用 ——图片压缩
import numpy as np import pandas as pd import scipy.io as sio import matplotlib.pyplot as pltdef pca(X): ##normalize X=(X-X.mean())/X.std() X=np.matrix(X) m=X.shape[0] cov=X.T*X/m U,S,V=np.linalg.svd(cov) return U,S,V## 数据压缩 def project_data(X,U,k): U_reduced=U[:,:k] return X*U_reduced.T## 数据恢复 def recover_data(X,U,k): U_rediced=U[:,:k] return z*U_rediced.T## 显示前n张图像 def plod_n_image(X,n): pic_size=int(np.sqrt(X.shape[1])) grid_size=int(np.sqrt(n)) face_image=X[:n,:] fig,ax=plt.subplots(nrows=grid_size,ncols=grid_size,sharex=True,sharey=True)for r in range(grid_size): for c in range(grid_size): ax[r,c].imshow(face_image[grid_size*r+c].reshape((pic_size,pic_size))) plt.xticks(np.array([])) plt.yticks(np.array([])) plt.show()file=r'E:\python\MyMachineLearning\exp7\ex7faces.mat' face_data=https://www.it610.com/article/sio.loadmat(file) X=face_data['X'] U,S,V=pca(X) z=project_data(X,U,100)

八、异常检测 这里使用scipy库中的内置函数norm
import numpy as np import pandas as pd import matplotlib.pyplot as plt import scipy.io as sio from scipy import stats import sklearn.preprocessing## 计算每个特征的方差和均值 def compute_gaussian(X): ## axis=0 表示对列进行平均 mu=X.mean(axis=0) sigma=X.var(axis=0) return mu,sigma## 寻找好的阈值 def select_threshold(pval,yval): best_epsilon=0 best_f1=0 f1=0 ## 步长 step=(pval.max()-pval.min())/1000 for epsilon in np.arange(pval.min(),pval.max(),step): preds=pvalbest_f1): best_f1=f1 best_epsilon=epsilon return best_epsilon,best_f1file=r'E:\python\MyMachineLearning\exp8\ex8data1.mat' data=https://www.it610.com/article/sio.loadmat(file) X=data['X']# 数据可视化 # fig,ax=plt.subplots() # ax.scatter(X[:,0],X[:,1]) # plt.show()mu,sigma=compute_gaussian(X) ## 绘制高斯分布的图形 # fig,ax=plt.subplots() # xplot=np.linspace(0,25,100) # yplot=np.linspace(0.25,100) # ## 生成网格,返回x坐标和y坐标 # Xplot,Yplot=np.meshgrid(xplot,yplot) # #计算 对应的值 # Z = np.exp((-0.5)*((Xplot-mu[0])**2/sigma[0]+(Yplot-mu[1])**2/sigma[1])) # ## 绘制等值图 # contour = plt.contour(Xplot, Yplot, Z,[10**-11, 10**-7, 10**-5, 10**-3, 0.1],colors='k') # ax.scatter(X[:,0], X[:,1]) # plt.show()## 通过验证集来确定 阈值(用于判断异常点) Xval=data['Xval'] yval=data['yval']## scipy 内置函数,用于给出对应点正态分布概率 # dist=stats.norm(mu[0],sigma[0]) # dist.pdf(X[:,0])p=np.zeros((Xval.shape[0],Xval.shape[1])) p[:,0] = stats.norm(mu[0], sigma[0]).pdf(X[:,0]) p[:,1] = stats.norm(mu[1], sigma[1]).pdf(X[:,1])pval = np.zeros((Xval.shape[0], Xval.shape[1])) pval[:,0] = stats.norm(mu[0], sigma[0]).pdf(Xval[:,0]) pval[:,1] = stats.norm(mu[1], sigma[1]).pdf(Xval[:,1])##可视化分类结果 # epsilon, f1 = select_threshold(pval, yval) # outliers = np.where(p < epsilon) # fig,ax=plt.subplots() # ax.scatter(X[:,0],X[:,1]) # ax.scatter(outliers[:,0],outliers[:,1],c='r',marker='x') # plt.show()

对于多元高斯分布模型的创建如下
## 多元高斯分布 mu=X.mean(axis=0) cov=np.cov(X.T) ## 创建多元高斯分布模型 multi_normal = stats.multivariate_normal(mu, cov)

    推荐阅读