Machine Learning Chapter 8 -- Predicting with Regression


regression.py
from numpy import *
import matplotlib.pyplot as plt

# Data-loading function: returns dataMat, labelMat.
def loadDataSet(fileName):  # general function to parse tab-delimited floats
    numFeat = len(open(fileName).readline().split('\t')) - 1  # get number of fields
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))  # builds a list of lists
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
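# A quick sanity check of the loader -- a sketch, assuming the book's ex0.txt
# (whose first column is the constant 1.0) sits next to the script:
#xArr, yArr = loadDataSet("ex0.txt")
#print(len(xArr), len(xArr[0]))  # number of rows, features per row
#print(xArr[0], yArr[0])         # first feature vector and its label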

# Standard (OLS) regression function: returns the weight vector ws.
def standRegres(xArr, yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws
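# standRegres solves the normal equation ws = (X^T X)^-1 X^T y and gives up
# when X^T X is singular. A sketch of a more robust alternative (my addition,
# not the book's code): numpy's least-squares solver returns the minimum-norm
# solution even for singular X^T X, so no determinant check is needed.
def standRegresLstsq(xArr, yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    ws, residuals, rank, sv = linalg.lstsq(xMat, yMat, rcond=None)
    return mat(ws)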
# Test plain least-squares fitting.
def _test_8_1():
    xArr, yArr = loadDataSet("ex0.txt")
    #print(xArr[0:2])
    #print(yArr[0:2])
    xMat = mat(xArr)
    yMat = mat(yArr)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    t = ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])
    # Plot the fitted line (sort x so the line is drawn left to right).
    xCopy = xMat.copy()
    xCopy.sort(0)
    ws = standRegres(xArr, yArr)
    yHat = xCopy * ws
    t = ax.plot(xCopy[:, 1], yHat)
    #t = ax.plot(ws)  # would plot the raw weight vector; not meaningful on these axes
    print(ws)
    plt.show()

#_test_8_1()

# Locally weighted linear regression: returns the estimate for a single test point.
def lwlr(testPoint, xArr, yArr, k=1.0):
    xMat = mat(xArr); yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))
    for j in range(m):  # next two lines create the Gaussian kernel weights matrix
        diffMat = testPoint - xMat[j, :]
        weights[j, j] = exp(diffMat * diffMat.T / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws
# Returns the estimates (yHat) for all test points.
def lwlrTest(testArr, xArr, yArr, k=1.0):  # loops over all the data points and applies lwlr to each one
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
    return yHat
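# Each training point j gets the Gaussian kernel weight
# w[j,j] = exp(-(x_j - x)(x_j - x)^T / (2*k^2)), so k sets how local the fit is.
# A tiny illustration (my sketch, not from the book) of how fast the weights
# decay with distance for a few kernel widths:
def _kernel_demo(d=0.5):  # d: hypothetical distance from the test point
    for k in (0.1, 0.5, 1.0):
        w = exp(-(d ** 2) / (2.0 * k ** 2))
        print("k=%.1f -> weight %.4f" % (k, w))  # smaller k -> weight drops off faster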

# Test locally weighted fitting.
def _test_8_2(k):
    xArr, yArr = loadDataSet("ex0.txt")
    yHat = lwlrTest(xArr, xArr, yArr, k)
    xMat = mat(xArr)
    yMat = mat(yArr)
    srtInd = xMat[:, 1].argsort(0)
    xSort = xMat[srtInd][:, 0, :]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    t = ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0], s=2, c='red')
    # Plot the fitted curve.
    #ws = standRegres(xArr, yArr)  # OLS weights, unused in this plot
    t = ax.plot(xSort[:, 1], yHat[srtInd])
    plt.show()

#_test_8_2(0.01)  # k=1 underfits; k=0.003 overfits
# Sum of squared errors, used to compare the different kernel widths.
def rssError(yArr, yHatArr):  # yArr and yHatArr both need to be arrays
    return ((yArr - yHatArr) ** 2).sum()
def _test_wucha_():
    abX, abY = loadDataSet("abalone.txt")
    yHat01 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    yHat1 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    yHat10 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    print(rssError(abY[0:99], yHat01.T))
    print(rssError(abY[0:99], yHat1.T))
    print(rssError(abY[0:99], yHat10.T))
    print("Look at the new data\n")
    yHat01 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    yHat1 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    yHat10 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print(rssError(abY[100:199], yHat01.T))
    print(rssError(abY[100:199], yHat1.T))
    print(rssError(abY[100:199], yHat10.T))
#_test_wucha_()
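# The nine calls above can be collapsed into one loop. A sketch of the same
# train-vs-new-data comparison (assumes abalone.txt is available):
def _test_k_sweep():
    abX, abY = loadDataSet("abalone.txt")
    for k in (0.1, 1, 10):
        yHatTrain = lwlrTest(abX[0:99], abX[0:99], abY[0:99], k)
        yHatNew = lwlrTest(abX[100:199], abX[0:99], abY[0:99], k)
        print("k=%s  train RSS=%.2f  new-data RSS=%.2f" % (
            k, rssError(abY[0:99], yHatTrain.T), rssError(abY[100:199], yHatNew.T)))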

# Ridge regression: returns a single weight vector for a given lambda.
def ridgeRegres(xMat, yMat, lam=0.2):
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws
# Returns a weight matrix; each row is the weight vector for a different lambda.
def ridgeTest(xArr, yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean  # to eliminate X0, take the mean off of Y
    # Standardize the X's.
    xMeans = mean(xMat, 0)  # calc mean then subtract it off
    xVar = var(xMat, 0)  # calc variance of Xi then divide by it
    xMat = (xMat - xMeans) / xVar
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
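# ridgeTest only sweeps lambda over exp(i-10), i = 0..29; it does not pick one.
# A simplified hold-out sketch (my addition) for choosing lambda: score each row
# of wMat on unseen data, standardizing the test X with the *training* mean and
# variance and adding back the training mean of y.
def pickLambda(xArr, yArr, split=70):
    xMat = mat(xArr); yMat = mat(yArr).T
    trainX, testX = xMat[:split], xMat[split:]
    trainY, testY = yMat[:split], yMat[split:]
    wMat = ridgeTest(trainX, trainY.T.tolist()[0])
    meanX = mean(trainX, 0); varX = var(trainX, 0)
    testXstd = (testX - meanX) / varX
    bestLam, bestErr = None, inf
    for i in range(shape(wMat)[0]):
        yHat = testXstd * mat(wMat[i]).T + mean(trainY)
        err = rssError(testY.A.flatten(), yHat.A.flatten())
        if err < bestErr:
            bestErr, bestLam = err, exp(i - 10)
    return bestLam, bestErr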
# Check whether the abalone-age prediction error is still as large with ridge
# (lambda added) as it was with locally weighted regression; plot how the
# coefficients change with lambda, which makes it easy to search for the best
# fit within a range of lambda values.
def _test_8_3():
    abX, abY = loadDataSet("abalone.txt")
    ridgeWeights = ridgeTest(abX, abY)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(ridgeWeights)
    print(ridgeWeights)
    plt.show()
#_test_8_3()
# Standardize the input matrix.
def regularize(xMat):  # regularize by columns
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)  # calc mean then subtract it off
    inVar = var(inMat, 0)  # calc variance of Xi then divide by it
    inMat = (inMat - inMeans) / inVar
    return inMat
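# Caveat: regularize divides by the per-column variance, so a constant column
# (variance 0) yields NaNs. A defensive variant (my sketch, not the book's code):
def regularizeSafe(xMat):
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)
    inVar = var(inMat, 0)
    inVar[inVar == 0.0] = 1.0  # leave zero-variance columns unscaled
    return (inMat - inMeans) / inVar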
# Forward stagewise linear regression.
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean  # can also regularize ys but will get smaller coef
    xMat = regularize(xMat)
    m, n = shape(xMat)
    returnMat = zeros((numIt, n))  # testing code, remove
    ws = zeros((n, 1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        #print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
# Watch how the weight matrix changes over the iterations.
def _test_8_4():
    xArr, yArr = loadDataSet("abalone.txt")
    stageWeights = stageWise(xArr, yArr, 0.005, 1000)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(stageWeights)
    plt.show()

# For comparison: the OLS solution on the same standardized data.
#xMat = mat(xArr)
#yMat = mat(yArr).T
#xMat = regularize(xMat)
#yM = mean(yMat, 0)
#yMat = yMat - yM
#print(standRegres(xMat, yMat.T).T)
_test_8_4()
test_lego.py
from time import sleep
import json
import urllib.request
import socket
socket.setdefaulttimeout(20)  # set the socket-level timeout to 20 seconds

def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    sleep(10)
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (
        myAPIstr, setNum)
    pg = urllib.request.urlopen(searchURL)
    retDict = json.loads(pg.read())
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else:
                newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except:
            print('problem with item %d' % i)
    pg.close()  # remember to close the response
def setDataCollect(retX, retY):
    searchForSet(retX, retY, 8288, 2006, 800, 49.99)
    searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
    searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
    searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
    searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
    searchForSet(retX, retY, 10196, 2009, 3263, 249.99)
lgX = []; lgY = []
setDataCollect(lgX, lgY)
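# Note: the Google Shopping search API used above was retired by Google, so
# searchForSet can no longer fetch live data. A sketch of an offline fallback
# ("lego8288.json" is a hypothetical saved response in the same JSON shape):
def searchForSetOffline(retX, retY, jsonFile, yr, numPce, origPrc):
    with open(jsonFile) as fh:
        retDict = json.load(fh)
    for i in range(len(retDict['items'])):
        currItem = retDict['items'][i]
        newFlag = 1 if currItem['product']['condition'] == 'new' else 0
        for item in currItem['product']['inventories']:
            sellingPrice = item['price']
            if sellingPrice > origPrc * 0.5:  # skip suspiciously cheap (incomplete) sets
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
#searchForSetOffline(lgX, lgY, "lego8288.json", 2006, 800, 49.99)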