河南郑州二手房房价预测
- 数据集
-
- 数据标准化
- 分割数据集和训练集
- 数据归一化
- 预测模型
-
- 数据未归一化前的随机森林预测
- 数据归一化过后的随机森林预测
- SVM径向基核函数预测
- KNN最邻近算法预测
- 决策树回归预测
- 梯度提升决策分类预测
- SVM线性核函数预测
- 使用不同的train_size去训练模型
- 模型评估
- 河南郑州二手房房价分析
数据集
文章图片
data_all.info()
文章图片
data_all.head()
文章图片
数据标准化 特征的归一化
# Feature standardization: one-hot encode the categorical house attributes.
# (Reconstructed from a scraped source: several statements were fused onto
# single lines and a site URL was injected into two assignments.)

# astype() returns a copy, so assign it back — the original bare call was a no-op.
data_all['unit_price'] = data_all['unit_price'].astype(float)

# train_data drops the first three columns; data_temp keeps column index 2
# (presumably the price label — TODO confirm against the dataset schema)
# so the normalized pipeline below can scale label and features together.
train_data = data_all[data_all.columns.delete([0, 1, 2])]
data_temp = data_all[data_all.columns.delete([0, 1])]

# Categorical columns to one-hot encode.
title_list = [
    'house_region', 'house_address', 'house_structure', 'house_elevator_sytle',
    'house_heating_type', 'elevator', 'house_transaction_type', 'house_useage',
    'house_years', 'house_floor_position', 'house_building_structure', 'house_orientation',
    'house_building_type', 'house_layout', 'house_area', 'house_decoration', 'house_last_time',
]
for item in title_list:
    # One-hot encode with get_dummies; new column names are prefixed with the feature name.
    pclassDf = pd.get_dummies(data_all[item], prefix=item).astype("int")
    # Append the dummy columns and drop the original categorical column,
    # in both working frames.
    train_data = pd.concat([train_data, pclassDf], axis=1)
    train_data.drop(columns=[item], inplace=True)
    data_temp = pd.concat([data_temp, pclassDf], axis=1)
    data_temp.drop(columns=[item], inplace=True)

print(data_temp.head())

# PCA (unsupervised dimensionality reduction) was experimented with and left disabled:
# from sklearn.decomposition import PCA
# model = PCA(n_components=1, random_state=25)  # n_components: number of principal components kept
# house_train = model.fit_transform(house_train)  # fit_transform = fit + transform in one step
# model1 = PCA(n_components=1, random_state=25)
# house_test_label = model1.fit_transform(house_test_label)
文章图片
分割数据集和训练集
from sklearn.model_selection import train_test_split

# Column index 2 of data_all (unit_price) is the regression label.
train_label = data_all.iloc[:, 2]
# 80/20 split with a fixed seed for reproducibility.
# (The scraped source used '/' as a line continuation; '\' was intended —
# rewritten with parentheses instead.)
house_train, house_test, house_train_label, house_test_label = train_test_split(
    train_data, train_label, test_size=0.2, random_state=42)
print("训练数据集:", house_train)
print("训练数据标签:", house_train_label)
print()
print("测试数据集:", house_test)
print("测试数据标签:", house_test_label)
print(data_all.info())
文章图片
数据归一化
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization of data_temp (label column included).
scaler_train_data = MinMaxScaler()
scaler_train_data.fit(data_temp)
scaler_train_data_features = scaler_train_data.transform(data_temp)
# Rebuild a DataFrame (transform returns a bare ndarray) and restore column names.
scaler_train_data_features = pd.DataFrame(scaler_train_data_features)
scaler_train_data_features.columns = data_temp.columns
print(scaler_train_data_features)

# Column 0 of the scaled frame is the (scaled) price label; the rest are features.
train_data = scaler_train_data_features[scaler_train_data_features.columns.delete([0])]
train_label = scaler_train_data_features.iloc[:, 0]
house_train, house_test, house_train_label, house_test_label = train_test_split(
    train_data, train_label, test_size=0.2, random_state=25)
print("训练数据集:", house_train)
print("训练数据标签:", house_train_label)
print()
print("测试数据集:", house_test)
print("测试数据标签:", house_test_label)
文章图片
house_test.head()
文章图片
house_test_label.head()
文章图片
house_test.info()
文章图片
预测模型 数据未归一化前的随机森林预测
from sklearn.ensemble import RandomForestRegressor

# Random forest on the current split — tree-based models need no feature scaling.
rfr = RandomForestRegressor().fit(house_train, house_train_label)
# Test-set R^2 (displayed as the cell's output in a notebook).
rfr.score(house_test, house_test_label)
正确率:
文章图片
# Predicted vs. actual prices; the second scatter (y plotted against itself)
# draws the ideal reference line.
predictions = rfr.predict(house_test)
plt.figure(figsize=(12, 6), facecolor='white')
plt.scatter(predictions, house_test_label, marker='o')
plt.scatter(house_test_label, house_test_label)
plt.show()
文章图片
数据归一化过后的随机森林预测
from sklearn.ensemble import RandomForestRegressor

# Random forest refit on the normalized split — scaling is unnecessary for
# tree-based models, so scores should be comparable to the raw-feature run.
rfr = RandomForestRegressor().fit(house_train, house_train_label)
# Test-set R^2.
rfr.score(house_test, house_test_label)
文章图片
# Predicted vs. actual (normalized run); self-scatter marks the ideal line.
predictions = rfr.predict(house_test)
plt.figure(figsize=(12, 6), facecolor='white')
plt.scatter(predictions, house_test_label, marker='o')
plt.scatter(house_test_label, house_test_label)
plt.show()
文章图片
SVM径向基核函数预测
# SVR was first imported only much later in the original script, so this
# line would have raised NameError — import it here.
from sklearn.svm import SVR

# SVM regression with the RBF (radial basis function) kernel.
r_svr = SVR(kernel="rbf")
r_svr.fit(house_train, house_train_label)
# Test-set R^2.
r_svr.score(house_test, house_test_label)
文章图片
# Predicted vs. actual for the RBF-kernel SVR; self-scatter marks the ideal line.
predictions = r_svr.predict(house_test)
plt.figure(figsize=(12, 6), facecolor='white')
plt.scatter(predictions, house_test_label, marker='o')
plt.scatter(house_test_label, house_test_label)
plt.show()
文章图片
KNN最邻近算法预测
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regressor with uniform (unweighted) neighbour votes.
knn = KNeighborsRegressor(weights="uniform").fit(house_train, house_train_label)
# Test-set R^2.
knn.score(house_test, house_test_label)
文章图片
# Predicted vs. actual for the KNN regressor; self-scatter marks the ideal line.
predictions = knn.predict(house_test)
plt.figure(figsize=(12, 6), facecolor='white')
plt.scatter(predictions, house_test_label, marker='o')
plt.scatter(house_test_label, house_test_label)
plt.show()
文章图片
决策树回归预测
from sklearn.tree import DecisionTreeRegressor

# Single decision-tree regressor (no scaling needed for tree models).
dt = DecisionTreeRegressor().fit(house_train, house_train_label)
# Test-set R^2.
dt.score(house_test, house_test_label)
文章图片
# Predicted vs. actual for the decision tree; self-scatter marks the ideal line.
predictions = dt.predict(house_test)
plt.figure(figsize=(12, 6), facecolor='white')
plt.scatter(predictions, house_test_label, marker='o')
plt.scatter(house_test_label, house_test_label)
plt.show()
文章图片
梯度提升决策分类预测
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted regression trees.
gbr = GradientBoostingRegressor().fit(house_train, house_train_label)
# Test-set R^2.
gbr.score(house_test, house_test_label)
文章图片
# Predicted vs. actual for gradient boosting; self-scatter marks the ideal line.
predictions = gbr.predict(house_test)
plt.figure(figsize=(12, 6), facecolor='white')
plt.scatter(predictions, house_test_label, marker='o')
plt.scatter(house_test_label, house_test_label)
plt.show()
文章图片
SVM线性核函数预测
from sklearn.svm import SVR

# SVM regression with a linear kernel.
l_svr = SVR(kernel='linear')
l_svr.fit(house_train, house_train_label)
# Score once and print — the original computed score() twice back to back,
# which is an expensive redundant pass for an SVR.
print(l_svr.score(house_test, house_test_label))
文章图片
# Predicted vs. actual for the linear-kernel SVR; self-scatter marks the ideal line.
predictions = l_svr.predict(house_test)
plt.figure(figsize=(12, 6), facecolor='white')
plt.scatter(predictions, house_test_label, marker='o')
plt.scatter(house_test_label, house_test_label)
plt.show()
文章图片
使用不同的train_size去训练模型
# Sweep train_size and record each model's test R^2.
# (Reconstructed: the scraped source fused loop headers and statements onto
# single lines; indentation restored. Redundant double score() calls per
# model were collapsed into a single computation.)
import numpy as np  # NOTE(review): no `import numpy as np` is visible earlier in this excerpt — confirm
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

size = np.arange(0.6, 1, 0.1)          # train_size values: 0.6, 0.7, 0.8, 0.9
scorelist = [[], [], [], [], [], []]   # one score series per model, in legend order

for i in range(0, 4):
    house_train, house_test, house_train_label, house_test_label = train_test_split(
        train_data, train_label, train_size=size[i], random_state=5)
    print("size: ", size[i])

    model = SVR(kernel='linear')  # linear kernel
    model.fit(house_train, house_train_label)
    score = model.score(house_test, house_test_label)
    print("线性核函数", score)
    scorelist[0].append(score)

    model = RandomForestRegressor()  # tree-based: no feature scaling required
    model.fit(house_train, house_train_label)
    score = model.score(house_test, house_test_label)
    print("随机森林", score)
    scorelist[1].append(score)

    model = SVR(kernel="rbf")  # radial basis function kernel
    model.fit(house_train, house_train_label)
    score = model.score(house_test, house_test_label)
    scorelist[2].append(score)
    print("径向基核函数", score)

    model = KNeighborsRegressor(weights="uniform")  # KNN regressor
    model.fit(house_train, house_train_label)
    score = model.score(house_test, house_test_label)
    scorelist[3].append(score)
    print("K临近回归器", score)

    model = DecisionTreeRegressor()  # decision-tree regression
    model.fit(house_train, house_train_label)
    score = model.score(house_test, house_test_label)
    scorelist[4].append(score)
    print("决策树回归", score)

    model = GradientBoostingRegressor()  # gradient boosting
    model.fit(house_train, house_train_label)
    score = model.score(house_test, house_test_label)
    scorelist[5].append(score)
    print("梯度提升决策分类", score)
文章图片
模型评估
# Plot each model's test R^2 against the train_size used.
# (Reconstructed: rcParams and legend calls were fused onto other lines in
# the scraped source; the legend belongs outside the loop so it is set once.)
plt.figure(figsize=(12, 6), facecolor='white')
plt.rcParams['font.sans-serif'] = 'SimHei'   # CJK-capable font for the Chinese labels
plt.rcParams['axes.unicode_minus'] = False   # render minus signs correctly with that font
color_list = ('red', 'blue', 'lightgreen', 'cornflowerblue', 'turquoise', 'magenta')
for i in range(0, 6):
    plt.plot(size, scorelist[i], color=color_list[i])
plt.legend(['SVM线性核函数', '随机森林', 'SVM径向基核函数', 'KNN最近邻', '决策树回归', '梯度提升决策分类'])
plt.xlabel('训练集占比')
plt.ylabel('准确率')
plt.title('不同的模型随着训练集占比变化曲线')
plt.show()
文章图片
河南郑州二手房房价分析 河南郑州二手房房价分析
推荐阅读
- java|为什么使用开源软件_为什么要使用开源软件()
- java|为什么要使用开源软件()
- leetcode|LeetCode 48. Rotate Image 时间复杂度(O(n))
- 大数据|关于Vision Transformer的一些思考
- YOLO|YOLOX网络结构
- 新品发布会|数据可视化编辑平台上线,小程序也能拥有可视化图层
- 算法|K-means,K-means++方法详解-机器学习分类问题常见算法
- 机器学习|bagging && boosting && stacking 集成学习
- 数据分析|数据分析学习1——数据获取,单因子探索分析与可视化