from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1.,2.],
[ 2.,0.,0.],
[ 0.,1., -1.]])
- 1
- 2
- 3
- 4
- 5
X_scaled = preprocessing.scale(X_train)
- 1
X_scaled
- 1
array([[ 0., -1.22474487,1.33630621],
[ 1.22474487,0., -0.26726124],
[-1.22474487,1.22474487, -1.06904497]])
- 1
- 2
- 3
- 4
X_scaled.mean(axis=0),X_scaled.std(axis=0)
- 1
(array([ 0.,0.,0.]), array([ 1.,1.,1.]))
- 1
- 2
计算后得到 1×3 的结果,即对每一列数据求平均值。
2、使得放缩在某个范围[0,1]
X_train = np.array([[ 1., -1.,2.],
[ 2.,0.,0.],
[ 0.,1., -1.]])
- 1
- 2
- 3
min_max_scaler = preprocessing.MinMaxScaler()
- 1
X_train_minmax = min_max_scaler.fit_transform(X_train)
- 1
X_train_minmax
- 1
array([[ 0.5,0.,1.],
[ 1.,0.5,0.33333333],
[ 0.,1.,0.]])
- 1
- 2
- 3
- 4
X_test = np.array([[ -3., -1.,4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax
- 1
- 2
- 3
array([[-1.5,0.,1.66666667]])
- 1
- 2
# Scale each feature (column) by its maximum absolute value; the fitted
# scaler is kept so the same scaling can be applied to other data later.
X_train = np.array([[1., -1., 2.],
                    [2., 0., 0.],
                    [0., 1., -1.]])
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs  # doctest +NORMALIZE_WHITESPACE
- 1
- 2
- 3
- 4
- 5
- 6
- 7
array([[ 0.5, -1. ,1. ],
[ 1. ,0. ,0. ],
[ 0. ,1. , -0.5]])
- 1
- 2
- 3
- 4
4、缩放稀疏数据
MaxAbsScaler和maxabs_scale专门用于缩放稀疏数据
5、二进制归一化
# Sample data to binarize: with the default threshold (0.0), values greater
# than the threshold map to 1 and all others map to 0.
X = [[1., -1., 2.],
     [2., 0., 0.],
     [0., 1., -1.]]
binarizer = preprocessing.Binarizer().fit(X)  # fit does nothing
binarizer
- 1
- 2
- 3
- 4
- 5
- 6
Binarizer(copy=True, threshold=0.0)
- 1
- 2
binarizer.transform(X)
- 1
array([[ 1.,0.,1.],
[ 1.,0.,0.],
[ 0.,1.,0.]])
- 1
- 2
- 3
- 4
#设定0,1划分值
binarizer = preprocessing.Binarizer(threshold=1.1)
binarizer.transform(X)
- 1
- 2
- 3
array([[ 0.,0.,1.],
[ 1.,0.,0.],
[ 0.,0.,0.]])
- 1
- 2
- 3
- 4
特征不是连续值而是分类
属性可取的离散值:[“male”, “female”]、[“from Europe”, “from US”, “from Asia”]、[“uses Firefox”, “uses Chrome”, “uses Safari”, “uses Internet Explorer”]
[“male”, “from US”, “uses Internet Explorer”]可以表示:[0, 1, 3]
[“female”, “from Asia”, “uses Chrome”]可以表示:[1, 2, 1]
#给定数据,可以自动推断属性的类别数
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
- 1
- 2
- 3
OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
handle_unknown='error', n_values='auto', sparse=True)
- 1
- 2
- 3
#对一个数据进行重新编码
enc.transform([[0, 1, 3]]).toarray()
- 1
- 2
array([[ 1.,0.,0.,1.,0.,0.,0.,0.,1.]])
- 1
- 2
#编码长度是9,即2+3+4,对应上面每个类别属性长度
- 1
#显式给出每个类别属性个数
enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])
# Note that there are missing categorical values for the 2nd and 3rd
# features
enc.fit([[1, 2, 3], [0, 2, 0]])
- 1
- 2
- 3
- 4
- 5
OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
handle_unknown='error', n_values=[2, 3, 4], sparse=True)
- 1
- 2
- 3
enc.transform([[1, 0, 0]]).toarray()
- 1
array([[ 0.,1.,1.,0.,0.,1.,0.,0.,0.]])
- 1
- 2
# Build an imputer configured to replace missing entries ('NaN') using the
# 'mean' strategy along axis 0.
# NOTE(review): `Imputer` appears to have been removed from recent
# scikit-learn releases in favour of `sklearn.impute.SimpleImputer` —
# confirm against the installed version before running this snippet.
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
- 1
- 2
# Training data for the imputer; one entry in the first column is missing (NaN).
data = [[1, 2], [np.nan, 3], [7, 6]]
- 1
data
- 1
[[1, 2], [nan, 3], [7, 6]]
- 1
- 2
imp.fit(data)
- 1
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
- 1
- 2
X = [[np.nan, 2], [6, np.nan], [7, 6]]
- 1
#用平均值去填充nan
print(imp.transform(X))
- 1
- 2
[[ 4.          2.        ]
 [ 6.          3.66666667]
 [ 7.          6.        ]]
- 1
- 2
- 3
- 4
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(6).reshape(3, 2)
X
- 1
- 2
- 3
- 4
array([[0, 1],
[2, 3],
[4, 5]])
- 1
- 2
- 3
- 4
poly = PolynomialFeatures(2)
poly.fit_transform(X)
- 1
- 2
array([[1.,0.,1.,0.,0.,1.],
[1.,2.,3.,4.,6.,9.],
[1.,4.,5.,16.,20.,25.]])
- 1
- 2
- 3
- 4
#np.log()、np.log10()、np.log2()、np.log1p()
#分别为自然对数(e)、底数为10、底数为2、log(1+x)
- 1
- 2
import numpy as np
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 1], [2, 3]])
transformer.transform(X)
- 1
- 2
- 3
- 4
- 5
array([[ 0.,0.69314718],
[ 1.09861229,1.38629436]])
- 1
- 2
- 3
def f(x):
    """Return ``x`` squared (``x**2``).

    Used below as the custom function wrapped by ``FunctionTransformer``;
    with a NumPy array the power is applied element-wise.
    """
    return x**2
- 1
- 2
transformer = FunctionTransformer(f)
X = np.array([[0, 1], [2, 3]])
transformer.transform(X)
- 1
- 2
- 3
array([[0, 1],
[4, 9]])
推荐阅读
- 人工智能|干货!人体姿态估计与运动预测
- 分析COMP122 The Caesar Cipher
- 技术|为参加2021年蓝桥杯Java软件开发大学B组细心整理常见基础知识、搜索和常用算法解析例题(持续更新...)
- C语言学习(bit)|16.C语言进阶——深度剖析数据在内存中的存储
- Python机器学习基础与进阶|Python机器学习--集成学习算法--XGBoost算法
- 数据结构与算法|【算法】力扣第 266场周赛
- 数据结构和算法|LeetCode 的正确使用方式
- leetcode|今天开始记录自己的力扣之路
- 人工智能|【机器学习】深度盘点(详细介绍 Python 中的 7 种交叉验证方法!)
- 网络|简单聊聊压缩网络