什么是 Pandas?Pandas 是基于 NumPy 构建的数据处理库,便于对大规模表格数据进行清洗、整理和分析,用法上有点类似数据库。
基本数据结构
import pandas as pd
import numpy as np# Series:索引在左边,值在右边。若没有创建索引,则默认0 ~ N-1索引
# Series: a one-dimensional labelled array. The index is printed on the left
# and the values on the right; with no explicit index it defaults to 0..N-1.
s = pd.Series([1, 3, np.nan, 44, 1])  # np.nan marks a missing value
print(s)

# DataFrame: a two-dimensional table with a row index and column labels;
# each column may hold a different dtype.
dates = pd.date_range("20160101", periods=6)
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,                   # row labels
    columns=['a', 'b', 'c', 'd'],  # column labels
)
print(df)
print(df['b'])

# Without explicit labels, both axes default to 0, 1, 2, ...
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)

# Building a frame from a dict: every key becomes a column. Scalars are
# repeated to match the length of the array-like columns (4 rows here).
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp("20130102"),
    'C': pd.Series(1, index=list(range(4)), dtype="float32"),
    'D': np.array([3] * 4, dtype="float32"),
    # Categorical column: a fixed set of labels stored as integer codes.
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': "foo",
})
print(df2)
print(df2.dtypes)

# `.codes` exposes the integer code of each categorical value.
print(pd.Categorical(df2['E']).codes)

print(df2.index, df2.columns)
print(df2.values)        # the raw data only, without labels
print(df2.describe())    # summary statistics of the table
print(df2.transpose())

# Sorting by labels: there is no sort_column(), only sort_index().
# axis=1 orders the COLUMN labels, axis=0 orders the ROW index;
# ascending=False means descending order.
print(df2.sort_index(axis=1, ascending=False))
print(df2.sort_index(axis=0, ascending=False))

# Sorting by the values of a column.
print(df2.sort_values(by='B'))
pandas 选择数据
import pandas as pd
import numpy as np

dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

# Basic selection: one column by name, rows by positional slice, rows by
# label slice (a label slice includes both end points).
print(df['A'])
print(df[0:3])
print(df["20130102":"20130104"])

# Label-based selection with .loc
print(df.loc["20130102"])
print(df.loc[:, ["A", "B"]])
print(df.loc["20130102", ["A", "B"]])

# Position-based selection with .iloc
print(df.iloc[3, 1])
print(df.iloc[3:5, 1:3])
print(df.iloc[[1, 3, 5], 1:3])

# Mixed selection: the first three rows by position plus two columns by
# label. The old `.ix` indexer was removed in pandas 1.0, so the positional
# part is translated into labels first and then handed to .loc.
mixed = df.loc[df.index[:3], ["A", "C"]]
print(mixed)

# Boolean filtering: keep the rows whose B value is greater than 10.
print(df[df.B > 10])
pandas 设置值
import pandas as pd
import numpy as np# 创建数据
# Build a 6x4 integer frame indexed by six consecutive days.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)

# Assign single cells by position (.iloc) and by label (.loc).
df.iloc[2, 2] = 1111
df.loc["20130101", "B"] = 222

# Wherever A > 4, set B to 0. This is done in ONE .loc call: the chained form
# `df.B[mask] = 0` may write to a temporary copy and leave `df` untouched.
df.loc[df.A > 4, "B"] = 0

# Add a whole new column initialised with a single value.
df["F"] = np.nan
print(df)

# Add a column from a Series; values are aligned on the index labels.
df["E"] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130101", periods=6))
print(df)
pandas 处理丢失数据
import pandas as pd
import numpy as np# 初始化这些数据为空
# Build a 6x4 frame and blank out two cells so there is something to clean.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan

# dropna() removes rows (axis=0) or columns (axis=1) containing NaN.
# how="any" drops when at least one NaN is present; how="all" drops only
# when every value is NaN. It returns a NEW frame, so keep the result.
dropped = df.dropna(axis=0, how="any")
print(dropped)

# fillna() replaces NaN with another value, e.g. 0 (also returns a new frame).
filled = df.fillna(value=0)
print(filled)

# isnull() yields a boolean frame that is True where a value is missing.
print(df.isnull())

# True if at least one value anywhere in the frame is missing.
has_nan = df.isnull().values.any()
print(has_nan)
pandas 导入导出
import pandas as pd
import numpy as np# 读取 csv文件
# Read a CSV file into a DataFrame (the path is machine-specific).
data = pd.read_csv("F:/train_set.csv")
print(data)

# Save the frame as a pickle file (a serialised binary format, "pkl" for short).
data.to_pickle("F:/grade123.pickle")
【机器学习】pandas 基础铺垫
concat 数据合并
import pandas as pd
import numpy as np# pandas合并concat
# Three 3x4 frames filled with 0, 1 and 2 respectively.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

# Vertical concatenation (axis=0). By default every frame keeps its own row
# labels, so 0, 1, 2 repeat; ignore_index=True renumbers the rows 0..N-1.
res = pd.concat([df1, df2, df3], axis=0)
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)

# The `join` argument: the two frames below share only columns b, c and d.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'])

# join='outer' keeps the union of the columns and fills the gaps with NaN;
# join='inner' keeps only the columns present in every frame.
res = pd.concat([df1, df2], axis=0, join='outer')
res1 = pd.concat([df1, df2], axis=0, join='inner')
print(res)
print(res1)

# Horizontal concatenation (axis=1): frames are placed side by side and
# aligned on the row index; rows missing from one side are filled with NaN.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])

res = pd.concat([df1, df2], axis=1)
print(res)

# Appending rows. DataFrame.append() was removed in pandas 2.0; pd.concat
# with ignore_index=True gives the same result.
res1 = pd.concat([df1, df2], ignore_index=True)
print(res1)

# Several frames can be stacked in one call.
res2 = pd.concat([df1, df2, df3], ignore_index=True)
print(res2)

# Adding a Series as one extra row: turn it into a one-row frame first so
# that its index labels line up with the column names.
res3 = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res3)
merge 数据合并
import pandas as pd

left = pd.DataFrame({
    "key": ["K0", "K1", "K2", "K3"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
})
right = pd.DataFrame({
    "key": ["K0", "K1", "K2", "K3"],
    "C": ["C0", "C1", "C2", "C3"],
    "D": ["D0", "D1", "D2", "D3"],
})
print(left)
print(right)

# Merge on a single key column.
res = pd.merge(left, right, on="key")
print(res)

# Merge on several key columns.
left = pd.DataFrame({
    "key1": ["K0", "K0", "K1", "K2"],
    "key2": ["K0", "K1", "K0", "K1"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
})
right = pd.DataFrame({
    "key1": ["K0", "K1", "K1", "K2"],
    "key2": ["K0", "K0", "K0", "K0"],
    "C": ["C0", "C1", "C2", "C3"],
    "D": ["D0", "D1", "D2", "D3"],
})

# `how` selects which key combinations survive:
#   inner - only combinations present on both sides
#   outer - every combination from either side, gaps filled with NaN
#   left  - every row of `left`, plus whatever matches from `right`
#   right - every row of `right`, plus whatever matches from `left`
res = pd.merge(left, right, on=["key1", "key2"], how="inner")
print(res)
res1 = pd.merge(left, right, on=["key1", "key2"], how="outer")
print(res1)
res2 = pd.merge(left, right, on=["key1", "key2"], how="left")
print(res2)
res3 = pd.merge(left, right, on=["key1", "key2"], how="right")
print(res3)

# indicator=True adds a column recording where each merged row came from.
df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]})
df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]})
print(df1)
print(df2)

# Merge on col1 with the indicator column enabled (named "_merge" by default).
res = pd.merge(df1, df2, on="col1", how="outer", indicator=True)
print(res)

# Passing a string instead of True names the indicator column.
res = pd.merge(df1, df2, on="col1", how="outer", indicator="Result")
print(res)

# Merge on the row index instead of on a column.
left = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2"],
        "B": ["B0", "B1", "B2"],
    },
    index=["K0", "K1", "K2"],
)
right = pd.DataFrame(
    {
        "C": ["C0", "C2", "C3"],
        "D": ["D0", "D2", "D3"],
    },
    index=["K0", "K2", "K3"],
)

res = pd.merge(left, right, left_index=True, right_index=True, how="outer")
print(res)
res = pd.merge(left, right, left_index=True, right_index=True, how="inner")
print(res)

# Overlapping column names: both frames have an "age" column, so `suffixes`
# is used to tell the two apart in the merged result.
boys = pd.DataFrame({"K": ["K0", "K1", "K2"], "age": [1, 2, 3]})
girls = pd.DataFrame({"K": ["K0", "K0", "K3"], "age": [4, 5, 6]})
res = pd.merge(boys, girls, on="K", suffixes=("_boy", "_girl"), how="inner")
print(res)
利用 matplotlib 作图
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt# Series可视化
# Series plot: 1000 random values indexed 0..999.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# cumsum() returns a new object with the running total; keep the result so
# that the plot shows the cumulative curve rather than the raw noise.
data = data.cumsum()
data.plot()
plt.show()

# DataFrame plot: four random columns, accumulated the same way.
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
# Line plot of all four columns (left disabled, as in the original):
# data.plot()
# plt.show()

# Scatter plots: the second call draws on the axes returned by the first
# (via ax=ax), so both point clouds share one figure.
ax = data.plot.scatter(x="A", y="B", color="DarkBlue", label="Class")
data.plot.scatter(x="A", y="C", color="LightGreen", label="Class2", ax=ax)
plt.show()