【机器学习】pandas基础铺垫机器学习竞赛

什么是 Pandas？Pandas 是基于 NumPy 构建的数据处理包，方便对大规模数据进行处理和整理，使用方式上有点类似数据库中的表。

基本数据结构

import numpy as np
import pandas as pd

# Series: labels (index) on the left, values on the right. When no index is
# supplied, the default integer index 0 .. N-1 is used.
s = pd.Series([1, 3, np.nan, 44, 1])  # np.nan marks a missing value
print(s)

# DataFrame: a tabular structure with an ordered set of rows and columns.
# Each column may hold a different value type; there is both a row index
# and a column index.
dates = pd.date_range("20160101", periods=6)
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,                   # row labels
    columns=["a", "b", "c", "d"],  # column labels
)
print(df)
print(df["b"])

# Without explicit row/column labels both axes are numbered 0, 1, 2, ...
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)

# A DataFrame built from a dict: scalars are broadcast to the length of the
# array-like entries (4 rows here).
df2 = pd.DataFrame({
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3] * 4, dtype="float32"),
    # Categorical column: class labels that can be mapped to integer codes.
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
})
print(df2)
print(df2.dtypes)

# .codes converts the categorical labels into their integer codes.
print(pd.Categorical(df2["E"]).codes)

print(df2.index, df2.columns)
print(df2.values)       # the underlying data only, without labels
print(df2.describe())   # summary statistics of the table
print(df2.transpose())

# There is no sort_column(); labels are ordered with sort_index().
# axis=1 orders the column labels, axis=0 orders the row labels;
# ascending=False requests descending order.
print(df2.sort_index(axis=1, ascending=False))
print(df2.sort_index(axis=0, ascending=False))

# Sort by the values of a column instead of by labels.
print(df2.sort_values(by="B"))

pandas 选择数据

import numpy as np
import pandas as pd

dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)

# Basic selection: a column by name, rows by position slice, rows by a
# label slice on the date index.
print(df["A"])
print(df[0:3])
print(df["20130102":"20130104"])

# Label-based selection with .loc
print(df.loc["20130102"])
print(df.loc[:, ["A", "B"]])
print(df.loc["20130102", ["A", "B"]])

# Position-based selection with .iloc
print(df.iloc[3, 1])
print(df.iloc[3:5, 1:3])
print(df.iloc[[1, 3, 5], 1:3])

# Mixed selection (first three rows by position, two columns by label).
# The old `.ix` indexer was removed from pandas, so the positions are
# translated to labels via df.index and the lookup goes through .loc.
first_rows_ac = df.loc[df.index[:3], ["A", "C"]]
print(first_rows_ac)

# Boolean filtering: keep the rows whose B value is greater than 10.
print(df[df.B > 10])

pandas 设置值

import numpy as np
import pandas as pd

# Build the sample data.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)

# Assign single cells by position and by label.
df.iloc[2, 2] = 1111
df.loc["20130101", "B"] = 222

# Where A > 4, set B to 0. A single .loc call is used instead of the chained
# form `df.B[mask] = 0`, which may write to a temporary copy rather than df.
df.loc[df.A > 4, "B"] = 0

# Initialise a whole new column with one value.
df["F"] = np.nan
print(df)

# Add a column from a Series; values are aligned on the index.
df["E"] = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range("20130101", periods=6),
)
print(df)

pandas 处理丢失数据

import numpy as np
import pandas as pd

# Build the sample data and blank out two cells.
# The frame is created as float so that NaN can be stored directly; writing
# NaN into an integer column relies on an implicit upcast that recent pandas
# releases deprecate.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    np.arange(24, dtype=float).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan

# dropna() returns a NEW frame without the rows (axis=0) or columns (axis=1)
# that contain NaN; df itself is left untouched.
# how="any": drop if at least one NaN is present; how="all": only if all are.
dropped = df.dropna(axis=0, how="any")
print(dropped)

# fillna() returns a new frame with NaN replaced by another value, e.g. 0.
filled = df.fillna(value=0)
print(filled)

# isnull() marks every NaN cell with True.
null_mask = df.isnull()
print(null_mask)

# True when the frame contains at least one NaN.
has_nan = bool(null_mask.values.any())
print(has_nan)

pandas 导入导出

import numpy as np
import pandas as pd

# Read a CSV file into a DataFrame.
data = pd.read_csv("F:/train_set.csv")
print(data)

# Save the frame as a pickle file (a serialised data format, suffix "pkl").
data.to_pickle("F:/grade123.pickle")

【【机器学习】pandas基础铺垫】
concat 数据合并

import numpy as np
import pandas as pd

# --- concat: stacking frames -------------------------------------------
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=["a", "b", "c", "d"])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=["a", "b", "c", "d"])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=["a", "b", "c", "d"])

# Vertical concatenation. By default each frame keeps its own row labels
# (0, 1, 2 repeated); ignore_index=True renumbers the result 0 .. N-1.
res = pd.concat([df1, df2, df3], axis=0)
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)

# --- concat: join modes --------------------------------------------------
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=["a", "b", "c", "d"])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=["b", "c", "d", "e"])

# join="outer": union of all columns, missing cells are filled with NaN.
res = pd.concat([df1, df2], axis=0, join="outer")
# join="inner": only the columns shared by every frame are kept.
res1 = pd.concat([df1, df2], axis=0, join="inner")
print(res)
print(res1)

# --- concat along the other axis ----------------------------------------
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=["a", "b", "c", "d"])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=["a", "b", "c", "d"])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=["a", "b", "c", "d"])
s1 = pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"])

# axis=1 places the frames side by side, aligned on the row index; rows
# that exist in only one frame would be padded with NaN.
res = pd.concat([df1, df2], axis=1)
print(res)

# --- appending rows -------------------------------------------------------
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Put df2 below df1 and renumber the index.
res1 = pd.concat([df1, df2], ignore_index=True)
print(res1)

# Several frames can be appended at once.
res2 = pd.concat([df1, df2, df3], ignore_index=True)
print(res2)

# Append a Series as one extra row: turn it into a one-row frame first so
# that its labels line up with the columns.
res3 = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res3)

merge 数据合并

import pandas as pd

# --- merge on a single key column ---------------------------------------
left = pd.DataFrame({
    "key": ["K0", "K1", "K2", "K3"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
})
right = pd.DataFrame({
    "key": ["K0", "K1", "K2", "K3"],
    "C": ["C0", "C1", "C2", "C3"],
    "D": ["D0", "D1", "D2", "D3"],
})
print(left)
print(right)

res = pd.merge(left, right, on="key")
print(res)

# --- merge on several key columns -----------------------------------------
left = pd.DataFrame({
    "key1": ["K0", "K0", "K1", "K2"],
    "key2": ["K0", "K1", "K0", "K1"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
})
right = pd.DataFrame({
    "key1": ["K0", "K1", "K1", "K2"],
    "key2": ["K0", "K0", "K0", "K0"],
    "C": ["C0", "C1", "C2", "C3"],
    "D": ["D0", "D1", "D2", "D3"],
})

# how="inner": only key pairs present on both sides.
res = pd.merge(left, right, on=["key1", "key2"], how="inner")
print(res)
# how="outer": every key pair from either side.
res1 = pd.merge(left, right, on=["key1", "key2"], how="outer")
print(res1)
# how="left": every row of `left`, plus matching rows of `right` if any.
res2 = pd.merge(left, right, on=["key1", "key2"], how="left")
print(res2)
# how="right": every row of `right`, plus matching rows of `left` if any.
res3 = pd.merge(left, right, on=["key1", "key2"], how="right")
print(res3)

# --- indicator: record where each row came from --------------------------
df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]})
df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]})
print(df1)
print(df2)

# indicator=True adds a column describing the origin of every merged row.
res = pd.merge(df1, df2, on="col1", how="outer", indicator=True)
print(res)

# A string gives that indicator column a custom name.
res = pd.merge(df1, df2, on="col1", how="outer", indicator="Result")
print(res)

# --- merge on the index -----------------------------------------------------
left = pd.DataFrame(
    {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]},
    index=["K0", "K1", "K2"],
)
right = pd.DataFrame(
    {"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]},
    index=["K0", "K2", "K3"],
)

res = pd.merge(left, right, left_index=True, right_index=True, how="outer")
print(res)
res = pd.merge(left, right, left_index=True, right_index=True, how="inner")
print(res)

# --- overlapping column names: suffixes -----------------------------------
# Both frames have an "age" column; the suffixes keep them apart.
boys = pd.DataFrame({"K": ["K0", "K1", "K2"], "age": [1, 2, 3]})
girls = pd.DataFrame({"K": ["K0", "K0", "K3"], "age": [4, 5, 6]})
res = pd.merge(boys, girls, on="K", suffixes=["_boy", "_girl"], how="inner")
print(res)

利用 matplotlib 作图

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# --- Series plot ------------------------------------------------------------
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# cumsum() returns a new object, so the running total has to be re-bound;
# otherwise the raw noise would be plotted instead.
data = data.cumsum()
data.plot()
plt.show()

# --- DataFrame plot ---------------------------------------------------------
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
)
data = data.cumsum()
# Calling data.plot() followed by plt.show() here would draw the four
# columns as lines; the original left those two calls disabled.

# --- scatter plot -----------------------------------------------------------
# The second scatter is drawn onto the axes returned by the first one so
# both point clouds share a single figure.
ax = data.plot.scatter(x="A", y="B", color="DarkBlue", label="Class")
data.plot.scatter(x="A", y="C", color="LightGreen", label="Class2", ax=ax)
plt.show()