Pandas数据分析基础流程及操作
【Pandas数据分析基础流程及操作】Pandas基础操作
准备:
导入库:
import pandas as pd # 导入pandas库并简写为pd
import numpy as np # 导入numpy库并简写为np
一、数据导入
pd.read_csv(filename) # 导入csv格式文件中的数据
pd.read_table(filename) # 导入有分隔符的文本 (如TSV) 中的数据
pd.read_excel(filename) # 导入Excel格式文件中的数据
pd.read_sql(query, connection_object) # 导入SQL数据表/数据库中的数据
pd.read_json(json_string) # 导入JSON格式的字符串、URL地址或者文件中的数据
pd.read_html(url) # 导入经过解析的URL地址中包含的数据框 (DataFrame) 数据
pd.read_clipboard() # 导入系统粘贴板里面的数据
pd.DataFrame(dict) # 导入Python字典 (dict) 里面的数据,其中key是数据框的表头,value是数据框的内容。
实际数据导入
data = pd.read_csv('D:\\数据分析\\天池数据项目数据\\淘宝用户行为分析\\UserBehavior\\UserBehavior_test.csv')
# D:\\数据分析\\天池数据项目数据\\淘宝用户行为分析\\UserBehavior\\UserBehavior_test.csv 为本地csv文件目录
二、数据导出
df.to_csv(filename) # 将数据框 (DataFrame)中的数据导出到csv格式的文件中
df.to_excel(filename) # 将数据框 (DataFrame)中的数据导出到Excel格式的文件中
df.to_sql(table_name,connection_object) # 将数据框 (DataFrame)中的数据导出到SQL数据表/数据库中
df.to_json(filename) # 将数据框 (DataFrame)中的数据导出到JSON格式的文件中
三、数据结构及创建 Series
1、通过list创建
from pandas import Series,DataFrame
data0 = [1,2,3,4]
s = Series(data0,index=['a','b','c','d'])
print(s)
a1
b2
c3
d4
dtype: int64
2、通过数组创建
s1 = np.array(['a','s','d','f','g'])
print(s1)
['a' 's' 'd' 'f' 'g']
3、通过字典创建
data1 = {'a':1,'b':2,'c':3,'d':4}
s2 = pd.Series(data1,index=['a','d','e','b'])
print(s2)
a1.0
d4.0
eNaN
b2.0
dtype: float64
以上结果索引顺序保持传入index的顺序不变,字典中缺少的元素使用NaN(非数字)填充 DataFrame
1、从列表创建DataFrame a、通过单个列表或列表字典创建
data3 = [1,2,3,4,5,6]
s3 = pd.DataFrame(data3)
print(s3)
0
01
12
23
34
45
56
data4 = [['lijie',20],['zhangsan',45],['wangwu',30]]
s4 = pd.DataFrame(data4,columns=['name','age'])
print(s4)
nameage
0lijie20
1zhangsan45
2wangwu30
b、通过字典列表创建
data5 = [{'a':1,'b':2},{'c':5,'d':8,'e':10}]
s5 = pd.DataFrame(data5)
print(s5)
abcde
01.02.0NaNNaNNaN
1NaNNaN5.08.010.0
2、从ndarrays/Lists的字典来创建DataFrame
data6 = {'name':['tom','lose','steve','bob'],'age':[12,21,30,14]}
s6 = pd.DataFrame(data6)
print(s6)
nameage
0tom12
1lose21
2steve30
3bob14
3、从系列字典创建DataFrame
data7 = {'one':pd.Series([1,2,3],index = ['a','b','c']),
'two':pd.Series([1,2,3,4],index = ['a','b','c','d'])}
s7 = pd.DataFrame(data7)
print(s7)
onetwo
a1.01
b2.02
c3.03
dNaN4
四、数据查看 1、基础属性
a、Series
s8 = Series([1,2,3,4,5])
print(s8)
01
12
23
34
45
dtype: int64
s8.axes # 返回行轴标签列表
[RangeIndex(start=0, stop=5, step=1)]
s8.dtype # 返回对象的数据类型
dtype('int64')
s8.empty # 如果系列为空,则返回True
False
s8.ndim # 返回底层数据的维数
1
s8.size # 返回基础数据中的元素数
5
s8.values # 将系列作为ndarray返回
array([1, 2, 3, 4, 5], dtype=int64)
s8.head() # 返回前n行
01
12
23
34
45
dtype: int64
s8.tail() # 返回最后n行
01
12
23
34
45
dtype: int64
b、DataFrame
data9 = {'one':pd.Series([1,2,3],index = ['a','b','c']),
'two':pd.Series([1,2,3,4],index = ['a','b','c','d'])}
s9 = pd.DataFrame(data9)
print(s9)
onetwo
a1.01
b2.02
c3.03
dNaN4
s9.T # 转置行与列
abcd
one 1.0 2.0 3.0 NaN
two 1.0 2.0 3.0 4.0
s9.axes # 返回一个列,行轴标签和列轴标签作为唯一的成员。
[Index(['a', 'b', 'c', 'd'], dtype='object'),
Index(['one', 'two'], dtype='object')]
s9.dtypes # 返回此对象中的数据类型(dtypes)
onefloat64
twoint64
dtype: object
s9.empty # 如果NDFrame完全为空(无项目,即任一轴的长度为0),则返回True
False
s9.ndim # 轴/数组维度大小
2
s9.shape # 返回表示DataFrame的维度的元组
(4, 2)
s9.size # NDFrame中的元素数
8
s9.values # NDFrame的Numpy表示
array([[ 1.,1.],
[ 2.,2.],
[ 3.,3.],
[nan,4.]])
s9.index # 查看索引
Index(['a', 'b', 'c', 'd'], dtype='object')
s9.columns # 查看列名
Index(['one', 'two'], dtype='object')
s9.info() # 查看数据框的索引、数据类型以及内存信息
Index: 4 entries, a to d
Data columns (total 2 columns):
#ColumnNon-Null CountDtype
----------------------------
0one3 non-nullfloat64
1two4 non-nullint64
dtypes: float64(1), int64(1)
memory usage: 256.0+ bytes
2、描述统计
data10 = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Minsu','Jack',
'Lee','David','Gasper','Betina','Andres']),
'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])}
s10 = pd.DataFrame(data10)
print(s10)
NameAgeRating
0Tom254.23
1James263.24
2Ricky253.98
3Vin232.56
4Steve303.20
5Minsu294.60
6Jack233.80
7Lee343.78
8David402.98
9Gasper304.80
10Betina514.10
11Andres463.65
s10.sum() # 求和
NameTomJamesRickyVinSteveMinsuJackLeeDavidGasperBe...
Age382
Rating44.92
dtype: object
s10.count() # 非空元素
Name12
Age12
Rating12
dtype: int64
s10.min() # 最小值
NameAndres
Age23
Rating2.56
dtype: object
s10.max() # 最大值
NameVin
Age51
Rating4.8
dtype: object
s10.cumsum() # 累计求和
NameAge Rating
0Tom 254.23
1TomJames517.47
2TomJamesRicky7611.45
3TomJamesRickyVin9914.01
4TomJamesRickyVinSteve129 17.21
5TomJamesRickyVinSteveMinsu158 21.81
6TomJamesRickyVinSteveMinsuJack181 25.61
7TomJamesRickyVinSteveMinsuJackLee215 29.39
8TomJamesRickyVinSteveMinsuJackLeeDavid255 32.37
9TomJamesRickyVinSteveMinsuJackLeeDavidGasper285 37.17
10TomJamesRickyVinSteveMinsuJackLeeDavidGasperBe...336 41.27
11TomJamesRickyVinSteveMinsuJackLeeDavidGasperBe...382 44.92
s10['Age'].mean() # 年龄平均值
31.833333333333332
s10.describe() # 统计信息摘要
Age Rating
count12.00000012.000000
mean31.8333333.743333
std 9.2326820.661628
min 23.0000002.560000
25% 25.0000003.230000
50% 29.5000003.790000
75% 35.5000004.132500
max 51.0000004.800000
五、数据清洗 1、重命名
s.columns = ['a','b'] # 重命名数据框的列名称
data.head(5) # data为最开始导入的csv数据
12268318 2520377 pv1511544070
012333346.02520771.0pv1.511562e+09
112576651.0149192.0pv1.511573e+09
212576651.0149192.0pv1.511573e+09
314365585.02520377.0pv1.511596e+09
414606018.02735466.0NaN 1.511616e+09
data.columns = ["user_id","item_id","catagory_id","behavior_type","timestamp"]
data.head(5)
user_id item_id catagory_id behavior_typetimestamp
012333346.02520771.0pv1.511562e+09
112576651.0149192.0pv1.511573e+09
212576651.0149192.0pv1.511573e+09
314365585.02520377.0pv1.511596e+09
414606018.02735466.0NaN 1.511616e+09
s.rename() # 重命名列/行,传入的函数或字典值必须是1对1的,没有包含在字典或者Series中的标签将保持原来的名称。字典中包含df中没有的标签,不会报错
s11 = data.rename(columns={'user_id':'a','item_id':'b','catagory_id':'c','behavior_type':'d','timestamp':'e'})
s11.head()
abcde
012333346.02520771.0pv1.511562e+09
112576651.0149192.0pv1.511573e+09
212576651.0149192.0pv1.511573e+09
314365585.02520377.0pv1.511596e+09
414606018.02735466.0NaN 1.511616e+09
s.set_index(),可以设置单索引和复合索引
s12 = data.set_index('user_id') # 将user_id作为索引(单索引)
s12
item_id catagory_id behavior_typetimestamp
user_id
12333346.02520771.0pv1.511562e+09
12576651.0149192.0pv1.511573e+09
12576651.0149192.0pv1.511573e+09
14365585.02520377.0pv1.511596e+09
14606018.02735466.0NaN 1.511616e+09
... ... ... ... ...
100 4182583.01258177.0pv1.511751e+09
100 2337874.01194311.0pv1.511751e+09
100 3658601.02342116.0pv1.511758e+09
100 5153036.02342116.0pv1.511759e+09
100 598929.02429887.0buy 1.511759e+09
100 rows × 4 columns
s13 = data.set_index(['user_id','item_id']) # 将user_id作为索引(复合索引)
s13
catagory_id behavior_typetimestamp
user_id item_id
12333346.02520771.0pv1.511562e+09
2576651.0149192.0pv1.511573e+09
2576651.0149192.0pv1.511573e+09
4365585.02520377.0pv1.511596e+09
4606018.02735466.0NaN 1.511616e+09
... ... ... ... ...
100 4182583.01258177.0pv1.511751e+09
2337874.01194311.0pv1.511751e+09
3658601.02342116.0pv1.511758e+09
5153036.02342116.0pv1.511759e+09
598929.02429887.0buy 1.511759e+09
100 rows × 3 columns
2、重复值
duplicated() 查找重复值
data.duplicated()
0False
1False
2False
3False
4False
...
95False
96False
97False
98False
99False
Length: 100, dtype: bool
drop_duplicates() 删除重复值
data.drop_duplicates()
user_id item_id catagory_id behavior_typetimestamp
012333346.02520771.0pv1.511562e+09
112576651.0149192.0pv1.511573e+09
212576651.0149192.0pv1.511573e+09
314365585.02520377.0pv1.511596e+09
414606018.02735466.0NaN 1.511616e+09
... ... ... ... ... ...
95100 4182583.01258177.0pv1.511751e+09
96100 2337874.01194311.0pv1.511751e+09
97100 3658601.02342116.0pv1.511758e+09
98100 5153036.02342116.0pv1.511759e+09
99100 598929.02429887.0buy 1.511759e+09
100 rows × 5 columns
3、空值、缺失值
data.isnull() # True为空,反之非空
user_id item_id catagory_id behavior_typetimestamp
0FalseFalseFalseFalseFalse
1FalseFalseFalseFalseFalse
2FalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalse
4FalseFalseFalseTrueFalse
... ... ... ... ... ...
95FalseFalseFalseFalseFalse
96FalseFalseFalseFalseFalse
97FalseFalseFalseFalseFalse
98FalseFalseFalseFalseFalse
99FalseFalseFalseFalseFalse
100 rows × 5 columns
data.isnull().values==True # 可用data.isnull().values==True来定位
array([[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False,True, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False,True],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False,True, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False,True, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False,True],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False]])
data.notnull() # true为非空,反之为空
user_id item_id catagory_id behavior_typetimestamp
0TrueTrueTrueTrueTrue
1TrueTrueTrueTrueTrue
2TrueTrueTrueTrueTrue
3TrueTrueTrueTrueTrue
4TrueTrueTrueFalseTrue
... ... ... ... ... ...
95TrueTrueTrueTrueTrue
96TrueTrueTrueTrueTrue
97TrueTrueTrueTrueTrue
98TrueTrueTrueTrueTrue
99TrueTrueTrueTrueTrue
100 rows × 5 columns
data.fillna(0) # 用0填充空值
user_id item_id catagory_id behavior_typetimestamp
012333346.02520771.0pv1.511562e+09
112576651.0149192.0pv1.511573e+09
212576651.0149192.0pv1.511573e+09
314365585.02520377.0pv1.511596e+09
414606018.02735466.001.511616e+09
... ... ... ... ... ...
95100 4182583.01258177.0pv1.511751e+09
96100 2337874.01194311.0pv1.511751e+09
97100 3658601.02342116.0pv1.511758e+09
98100 5153036.02342116.0pv1.511759e+09
99100 598929.02429887.0buy 1.511759e+09
100 rows × 5 columns
data.dropna(how='any', inplace=True) # 删除 含空值的行
4、字符处理
空格
data['behavior_type'].value_counts()
pv87
fav5
buy3
Name: behavior_type, dtype: int64
data.replace('p','pv')
user_id item_id catagory_id behavior_typetimestamp
012333346.02520771.0pv1.511562e+09
112576651.0149192.0pv1.511573e+09
212576651.0149192.0pv1.511573e+09
314365585.02520377.0pv1.511596e+09
51230380.0411153.0pv1.511645e+09
... ... ... ... ... ...
95100 4182583.01258177.0pv1.511751e+09
96100 2337874.01194311.0pv1.511751e+09
97100 3658601.02342116.0pv1.511758e+09
98100 5153036.02342116.0pv1.511759e+09
99100 598929.02429887.0buy 1.511759e+09
95 rows × 5 columns
去除空格
data['behavior_type'].map(str.strip) # 去除两端空格
0pv
1pv
2pv
3pv
5pv
...
95pv
96pv
97pv
98pv
99buy
Name: behavior_type, Length: 95, dtype: object
data['behavior_type'].map(str.lstrip) # 去除左端空格
0pv
1pv
2pv
3pv
5pv
...
95pv
96pv
97pv
98pv
99buy
Name: behavior_type, Length: 95, dtype: object
data['behavior_type'].map(str.rstrip) # 去除右端空格
0pv
1pv
2pv
3pv
5pv
...
95pv
96pv
97pv
98pv
99buy
Name: behavior_type, Length: 95, dtype: object
大小写
data['behavior_type'].map(str.upper) # 大写
0PV
1PV
2PV
3PV
5PV
...
95PV
96PV
97PV
98PV
99BUY
Name: behavior_type, Length: 95, dtype: object
data['behavior_type'].map(str.lower) # 小写
0pv
1pv
2pv
3pv
5pv
...
95pv
96pv
97pv
98pv
99buy
Name: behavior_type, Length: 95, dtype: object
data['behavior_type'].map(str.title) # 首字母大写
0Pv
1Pv
2Pv
3Pv
5Pv
...
95Pv
96Pv
97Pv
98Pv
99Buy
Name: behavior_type, Length: 95, dtype: object
data['behavior_type'].value_counts()
pv87
fav5
buy3
Name: behavior_type, dtype: int64
# data.replace('Pv','PV') 替换
5、异常和极端值
发现方法:
a、可通过describe()方法,观察统计的各项数据,发现异常值
b、借助散点图、箱型图、正态分布图等进行异常值的发现
处理办法
a、删除——先将异常值替换为na,然后用dropna()删除
b、视为缺失值——先将异常值替换为na,然后用缺失值处理方法处理(填充,插值等)
6、更改数据格式
data['timestamp'].dtype
dtype('float64')
data['timestamp'].astype('int32') # 将 'float64' 转化为 'int32'
01511561733
11511572885
21511572886
31511596146
51511644942
...
951511750991
961511751022
971511757958
981511758581
991511759458
Name: timestamp, Length: 95, dtype: int32
data['date'] = pd.to_datetime(data['timestamp'])
# 注意:timestamp为秒级时间戳,此处应写 pd.to_datetime(data['timestamp'], unit='s'),
# 否则默认按纳秒解析,会得到1970-01-01附近的错误日期(见下方输出)
data['date']
01970-01-01 00:00:01.511561733
11970-01-01 00:00:01.511572885
21970-01-01 00:00:01.511572886
31970-01-01 00:00:01.511596146
51970-01-01 00:00:01.511644942
...
951970-01-01 00:00:01.511750991
961970-01-01 00:00:01.511751022
971970-01-01 00:00:01.511757958
981970-01-01 00:00:01.511758581
991970-01-01 00:00:01.511759458
Name: date, Length: 95, dtype: datetime64[ns]
8、分列
str.split(self,pat = None,n = -1,expand = False )
pat str,可选:要分割的字符串或正则表达式。如果未指定,请在空白处分割。
n int,默认为-1(全部):限制输出的分割数。 None,则0和-1将被解释为返回所有拆分。
expand,默认为False:将拆分的字符串展开为单独的列。如果为True,则返回DataFrame / MultiIndex扩展维;
如果为False,则返回包含字符串列表的Series / Index。
s14 = pd.Series(["this is a regular sentence","https://docs.python.org/3/tutorial/index.html",np.nan])
s14
0this is a regular sentence
1https://docs.python.org/3/tutorial/index.html
2NaN
dtype: object
s14.str.split()
0[this, is, a, regular, sentence]
1[https://docs.python.org/3/tutorial/index.html]
2NaN
dtype: object
s14.str.rsplit()
0[this, is, a, regular, sentence]
1[https://docs.python.org/3/tutorial/index.html]
2NaN
dtype: object
s14.str.split(n=2)
0[this, is, a regular sentence]
1[https://docs.python.org/3/tutorial/index.html]
2NaN
dtype: object
s14.str.rsplit(n=2)
0[this is a, regular, sentence]
1[https://docs.python.org/3/tutorial/index.html]
2NaN
dtype: object
s14.str.rsplit(pat='/')
0[this is a regular sentence]
1[https:, , docs.python.org, 3, tutorial, index...
2NaN
dtype: object
s14.str.split(pat='/',expand=True)
012345
0this is a regular sentenceNoneNoneNoneNoneNone
1https:docs.python.org 3tutorialindex.html
2NaN NaN NaN NaN NaN NaN
六、数据预处理
1、数据分组、分段 groupby() 分组
data.groupby(['behavior_type']).count() # 按照'behavior_type'对data进行分组并计数
user_id item_id catagory_id timestampdate
behavior_type
buy 33333
fav 55555
pv8787878787
data.groupby(['behavior_type','item_id']).sum() # 按照'behavior_type'和'item_id'对data进行分组并求和
user_id catagory_id timestamp
behavior_typeitem_id
buy 598929.0100 2429887.01.511759e+09
1603476.0100 2951233.01.511580e+09
2971043.0100 4869428.01.511618e+09
fav 2158340.0100 2429887.01.511743e+09
2971043.0100 4869428.01.511565e+09
... ... ... ... ...
pv4954999.031233459.04.536499e+09
4973305.012520771.01.511969e+09
5002615.025040754.03.023684e+09
5100093.0100 2945933.01.511552e+09
5153036.0100 2342116.01.511759e+09
74 rows × 3 columns
cut()/qcut() 分段
score_list = np.random.randint(25, 100, size=20) # 分数列表
print(score_list)
[38 62 72 26 60 50 61 86 38 97 72 47 61 59 83 73 48 56 59 38]
bins = [0, 59, 70, 80, 100] # 指定分数区间
print(bins)
[0, 59, 70, 80, 100]
score_cut = pd.cut(score_list, bins) # 对分数按照分数区间进行分段
print(score_cut)
[(0, 59], (59, 70], (70, 80], (0, 59], (59, 70], ..., (70, 80], (0, 59], (0, 59], (0, 59], (0, 59]]
Length: 20
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
pd.value_counts(score_cut) # 查看各分数区间的人数
(0, 59]10
(59, 70]4
(80, 100]3
(70, 80]3
dtype: int64
2、 筛选、排序 sort_values() 按列进行排序
data.sort_values('user_id',ascending=True) # 按照'user_id'对data进行升序排序,ascending为False则为降序
user_id item_id catagory_id behavior_typetimestampdate
012333346.02520771.0pv1.511562e+091970-01-01 00:00:01.511561733
3013157558.02520771.0pv1.511985e+091970-01-01 00:00:01.511984943
3112087357.02131531.0pv1.512005e+091970-01-01 00:00:01.512004568
3214170517.0149192.0pv1.512011e+091970-01-01 00:00:01.512011019
3311340922.04690421.0pv1.512041e+091970-01-01 00:00:01.512041260
... ... ... ... ... ... ...
72100 2971043.04869428.0buy 1.511618e+091970-01-01 00:00:01.511617549
73100 2337874.01194311.0pv1.511680e+091970-01-01 00:00:01.511680485
74100 4182583.01258177.0pv1.511681e+091970-01-01 00:00:01.511680521
76100 4919803.02881542.0pv1.511684e+091970-01-01 00:00:01.511684032
99100 598929.02429887.0buy 1.511759e+091970-01-01 00:00:01.511759458
95 rows × 6 columns
data.sort_values(['user_id','item_id'],ascending=[True,False])
user_id item_id catagory_id behavior_typetimestampdate
1415002615.02520377.0pv1.511839e+091970-01-01 00:00:01.511839385
1615002615.02520377.0pv1.511844e+091970-01-01 00:00:01.511844273
2814973305.02520771.0pv1.511969e+091970-01-01 00:00:01.511969365
3714954999.0411153.0pv1.512061e+091970-01-01 00:00:01.512061318
4214954999.0411153.0pv1.512173e+091970-01-01 00:00:01.512172732
... ... ... ... ... ... ...
70100 1603476.02951233.0buy 1.511580e+091970-01-01 00:00:01.511579908
58100 1220136.04869428.0pv1.511551e+091970-01-01 00:00:01.511550908
65100 704268.0223690.0pv1.511564e+091970-01-01 00:00:01.511563606
92100 598929.02429887.0pv1.511743e+091970-01-01 00:00:01.511742899
99100 598929.02429887.0buy 1.511759e+091970-01-01 00:00:01.511759458
95 rows × 6 columns
sort_index() 按行进行排序 3、 连接和组合 append()
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df1.append(df2) # 在数据框df1的末尾追加数据框df2,其中df1和df2的列应该一致
ABCD
0A0B0C0D0
1A1B1C1D1
2A2B2C2D2
3A3B3C3D3
4A4B4C4D4
5A5B5C5D5
6A6B6C6D6
7A7B7C7D7
concat()
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
pd.concat([df1,df2],axis=1) # 沿列方向拼接df1和df2,按索引对齐,索引不匹配处用NaN填充(见下方输出)
ABCDABCD
0A0B0C0D0NaN NaN NaN NaN
1A1B1C1D1NaN NaN NaN NaN
2A2B2C2D2NaN NaN NaN NaN
3A3B3C3D3NaN NaN NaN NaN
4NaN NaN NaN NaN A4B4C4D4
5NaN NaN NaN NaN A5B5C5D5
6NaN NaN NaN NaN A6B6C6D6
7NaN NaN NaN NaN A7B7C7D7
join()
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'key': ['K0', 'K1', 'K0', 'K1']})
df2 = pd.DataFrame({'C': ['C0', 'C1'],
'D': ['D0', 'D1']},
index=['K0', 'K1'])
df1.join(df2, on='key') # 以df1的'key'列与df2的索引进行连接(默认左连接)
ABkey CD
0A0B0K0C0D0
1A1B1K1C1D1
2A2B2K0C0D0
3A3B3K1C1D1
merge()
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
result = pd.merge(left, right, on='key')
print(result)
keyABCD
0K0A0B0C0D0
1K1A1B1C1D1
2K2A2B2C2D2
3K3A3B3C3D3
七、数据索引、提取
df[col] 以数组 Series 的形式返回选取的列
df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
df['C']
00.340454
10.116156
20.234191
30.516992
40.354863
Name: C, dtype: float64
df[[col1,col2]] 以新的数据框(DataFrame)的形式返回选取的列
df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
df[['B','E']]
BE
00.3136190.848954
10.5718870.337651
20.6668880.086449
30.4236150.368037
40.4539950.479666
df.loc['index_one'] 按照索引选取
s = pd.Series(np.array(['I','Love','Data']))
s.loc[1]
'Love'
df.iloc[0] 按照位置选取
df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
df.iloc[0,:] # 选取第一行
A0.960768
B0.433873
C0.777235
D0.799733
E0.643273
Name: 0, dtype: float64
df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
df.iloc[0,0] # 选取第一行第一个元素
0.7353611931210813
时间序列 八、数据统计
描述统计
data.describe()
user_id item_id catagory_id timestamp
count95.0000009.500000e+019.500000e+019.500000e+01
mean45.8105262.831879e+062.494733e+061.511826e+09
std 49.5387871.398492e+061.421917e+062.213639e+05
min 1.0000004.625900e+041.491920e+051.511551e+09
25% 1.0000002.087357e+061.289235e+061.511682e+09
50% 1.0000002.791761e+062.465336e+061.511751e+09
75% 100.0000003.875118e+063.213828e+061.511980e+09
max 100.0000005.153036e+064.869428e+061.512282e+09
data.groupby(['behavior_type']).count().mean()
user_id31.666667
item_id31.666667
catagory_id31.666667
timestamp31.666667
date31.666667
dtype: float64
更多见数据查看 函数应用
apply:应用在DataFrame的行或列中
applymap:应用在DataFrame的每个元素中
map:应用在单独一列(Series)的每个元素中
九、数据透视表
pd.pivot_table()
推荐阅读
- Python基础|Python基础 - 练习1
- Java|Java基础——数组
- Python数据分析(一)(Matplotlib使用)
- Java基础-高级特性-枚举实现状态机
- 营养基础学20180331(课间随笔)??
- iOS面试题--基础
- HTML基础--基本概念--跟着李南江学编程
- typeScript入门基础介绍
- c++基础概念笔记
- 集体释放