Pandas数据分析基础流程及操作

【Pandas数据分析基础流程及操作】Pandas基础操作
准备: 导入库

import pandas as pd # 导入pandas库并简写为pd
import numpy as np # 导入numpy库并简写为np

一、数据导入
pd.read_csv(filename) # 导入csv格式文件中的数据 pd.read_table(filename) # 导入有分隔符的文本 (如TSV) 中的数据 pd.read_excel(filename) # 导入Excel格式文件中的数据 pd.read_sql(query, connection_object) # 导入SQL数据表/数据库中的数据 pd.read_json(json_string) # 导入JSON格式的字符,URL地址或者文件中的数据 pd.read_html(url) # 导入经过解析的URL地址中包含的数据框 (DataFrame) 数据 pd.read_clipboard() # 导入系统粘贴板里面的数据 pd.DataFrame(dict) # 导入Python字典 (dict) 里面的数据,其中key是数据框的表头,value是数据框的内容。

实际数据导入
data = pd.read_csv('D:\\数据分析\\天池数据项目数据\\淘宝用户行为分析\\UserBehavior\\UserBehavior_test.csv') # D:\\数据分析\\天池数据项目数据\\淘宝用户行为分析\\UserBehavior\\UserBehavior_test.csv 为本地csv文件目录

二、数据导出
df.to_csv(filename) # 将数据框 (DataFrame)中的数据导入csv格式的文件中 df.to_excel(filename) # 将数据框 (DataFrame)中的数据导入Excel格式的文件中 df.to_sql(table_name,connection_object) # 将数据框 (DataFrame)中的数据导入SQL数据表/数据库中 df.to_json(filename) # 将数据框 (DataFrame)中的数据导入JSON格式的文件中

三、数据结构及创建 Series
1、通过list创建
from pandas import Series,DataFrame data0 = [1,2,3,4] s = Series(data0,index=['a','b','c','d']) print(s)

a1 b2 c3 d4 dtype: int64

2、通过数组创建
s1 = np.array(['a','s','d','f','g']) print(s1)

['a' 's' 'd' 'f' 'g']

3、通过字典创建
data1 = {'a':1,'b':2,'c':3,'d':4} s2 = pd.Series(data1,index=['a','d','e','b']) print(s2)

a1.0 d4.0 eNaN b2.0 dtype: float64

上结果索引顺序保持不变,缺少的元素使用NAN(不是数字)填充 DataFrame
1、从列表创建DataFrame a、通过单个列表或列表字典创建
data3 = [1,2,3,4,5,6] s3 = pd.DataFrame(data3) print(s3)

0 01 12 23 34 45 56

data4 = [['lijie',20],['zhangsan',45],['wangwu',30]] s4 = pd.DataFrame(data4,columns=['name','age']) print(s4)

nameage 0lijie20 1zhangsan45 2wangwu30

b、通过字典列表创建
data5 = [{'a':1,'b':2},{'c':5,'d':8,'e':10}] s5 = pd.DataFrame(data5) print(s5)

abcde 01.02.0NaNNaNNaN 1NaNNaN5.08.010.0

2、从ndarrays/Lists的字典来创建DataFrame
data6 = {'name':['tom','lose','steve','bob'],'age':[12,21,30,14]} s6 = pd.DataFrame(data6) print(s6)

nameage 0tom12 1lose21 2steve30 3bob14

3、从系列字典创建DataFrame
data7 = {'one':pd.Series([1,2,3],index = ['a','b','c']), 'two':pd.Series([1,2,3,4],index = ['a','b','c','d'])} s7 = pd.DataFrame(data7) print(s7)

onetwo a1.01 b2.02 c3.03 dNaN4

四、数据查看 1、基础属性
a、Series
s8 = Series([1,2,3,4,5]) print(s8)

01 12 23 34 45 dtype: int64

s8.axes # 返回行轴标签列表

[RangeIndex(start=0, stop=5, step=1)]

s8.dtype # 返回对象的数据类型

dtype('int64')

s8.empty # 如果系列为空,则返回True

False

s8.ndim # 返回底层数据的维数

1

s8.size # 返回基础数据中的元素数

5

s8.values # 将系列作为ndarray返回 array([1, 2, 3, 4, 5], dtype=int64) s8.head() # 返回前n行

01 12 23 34 45 dtype: int64

s8.tail() # 返回最后n行

01 12 23 34 45 dtype: int64

b、DataFrame
data9 = {'one':pd.Series([1,2,3],index = ['a','b','c']), 'two':pd.Series([1,2,3,4],index = ['a','b','c','d'])} s9 = pd.DataFrame(data9) print(s9)

onetwo a1.01 b2.02 c3.03 dNaN4

s9.T # 转置行与列

abcd one 1.0 2.0 3.0 NaN two 1.0 2.0 3.0 4.0

s9.axes # 返回一个列,行轴标签和列轴标签作为唯一的成员。

[Index(['a', 'b', 'c', 'd'], dtype='object'), Index(['one', 'two'], dtype='object')]

s9.dtypes # 返回此对象中的数据类型(dtypes)

onefloat64 twoint64 dtype: object

s9.empty # 如果NDFrame完全为空[无项目],则返回为True; 如果任何轴的长度为0

False

s9.ndim # 轴/数组维度大小

2

s9.shape # 返回表示DataFrame的维度的元组

(4, 2)

s9.size # NDFrame中的元素数

8

s9.values # NDFrame的Numpy表示

array([[ 1.,1.], [ 2.,2.], [ 3.,3.], [nan,4.]])

s9.index # 查看索引

Index(['a', 'b', 'c', 'd'], dtype='object')

s9.columns # 查看列名

Index(['one', 'two'], dtype='object')

s9.info() # 查看数据框的索引、数据类型以及内存信息

Index: 4 entries, a to d Data columns (total 2 columns): #ColumnNon-Null CountDtype ---------------------------- 0one3 non-nullfloat64 1two4 non-nullint64 dtypes: float64(1), int64(1) memory usage: 256.0+ bytes

2、描述统计
data10 = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Minsu','Jack', 'Lee','David','Gasper','Betina','Andres']), 'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]), 'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])} s10 = pd.DataFrame(data10) print(s10)

NameAgeRating 0Tom254.23 1James263.24 2Ricky253.98 3Vin232.56 4Steve303.20 5Minsu294.60 6Jack233.80 7Lee343.78 8David402.98 9Gasper304.80 10Betina514.10 11Andres463.65

s10.sum() # 求和

NameTomJamesRickyVinSteveMinsuJackLeeDavidGasperBe... Age382 Rating44.92 dtype: object

s10.count() # 非空元素

Name12 Age12 Rating12 dtype: int64

s10.min() # 最小值

NameAndres Age23 Rating2.56 dtype: object

s10.max() # 最大值

NameVin Age51 Rating4.8 dtype: object

s10.cumsum() # 累计求和

NameAge Rating 0Tom 254.23 1TomJames517.47 2TomJamesRicky7611.45 3TomJamesRickyVin9914.01 4TomJamesRickyVinSteve129 17.21 5TomJamesRickyVinSteveMinsu158 21.81 6TomJamesRickyVinSteveMinsuJack181 25.61 7TomJamesRickyVinSteveMinsuJackLee215 29.39 8TomJamesRickyVinSteveMinsuJackLeeDavid255 32.37 9TomJamesRickyVinSteveMinsuJackLeeDavidGasper285 37.17 10TomJamesRickyVinSteveMinsuJackLeeDavidGasperBe...336 41.27 11TomJamesRickyVinSteveMinsuJackLeeDavidGasperBe...382 44.92

s10['Age'].mean() # 年龄平均值

31.833333333333332

s10.describe() # 统计信息摘要

Age Rating count12.00000012.000000 mean31.8333333.743333 std 9.2326820.661628 min 23.0000002.560000 25% 25.0000003.230000 50% 29.5000003.790000 75% 35.5000004.132500 max 51.0000004.800000

五、数据清洗 1、重命名
s.columns = ['a','b'] # 重命名数据框的列名称 data.head(5) # data为最开始导入的csv数据

12268318 2520377 pv1511544070 012333346.02520771.0pv1.511562e+09 112576651.0149192.0pv1.511573e+09 212576651.0149192.0pv1.511573e+09 314365585.02520377.0pv1.511596e+09 414606018.02735466.0NaN 1.511616e+09

data.columns = ["user_id","item_id","catagory_id","behavior_type","timestamp"] data.head(5)

user_id item_id catagory_id behavior_typetimestamp 012333346.02520771.0pv1.511562e+09 112576651.0149192.0pv1.511573e+09 212576651.0149192.0pv1.511573e+09 314365585.02520377.0pv1.511596e+09 414606018.02735466.0NaN 1.511616e+09

s.rename() # 重命名列/行,传入的函数或字典值必须是1对1的,没有包含在字典或者Series中的标签将保持原来的名称。字典中包含df中没有的标签,不会报错

s11 = data.rename(columns={'user_id':'a','item_id':'b','catagory_id':'c','behavior_type':'d','timestamp':'e'}) s11.head()

abcde 012333346.02520771.0pv1.511562e+09 112576651.0149192.0pv1.511573e+09 212576651.0149192.0pv1.511573e+09 314365585.02520377.0pv1.511596e+09 414606018.02735466.0NaN 1.511616e+09

s.set_index(),可以设置单索引和复合索引

s12 = data.set_index('user_id') # 将user_id作为索引(单索引) s12

item_id catagory_id behavior_typetimestamp user_id 12333346.02520771.0pv1.511562e+09 12576651.0149192.0pv1.511573e+09 12576651.0149192.0pv1.511573e+09 14365585.02520377.0pv1.511596e+09 14606018.02735466.0NaN 1.511616e+09 ... ... ... ... ... 100 4182583.01258177.0pv1.511751e+09 100 2337874.01194311.0pv1.511751e+09 100 3658601.02342116.0pv1.511758e+09 100 5153036.02342116.0pv1.511759e+09 100 598929.02429887.0buy 1.511759e+09 100 rows × 4 columns

s13 = data.set_index(['user_id','item_id']) # 将user_id作为索引(复合索引) s13

catagory_id behavior_typetimestamp user_id item_id 12333346.02520771.0pv1.511562e+09 2576651.0149192.0pv1.511573e+09 2576651.0149192.0pv1.511573e+09 4365585.02520377.0pv1.511596e+09 4606018.02735466.0NaN 1.511616e+09 ... ... ... ... ... 100 4182583.01258177.0pv1.511751e+09 2337874.01194311.0pv1.511751e+09 3658601.02342116.0pv1.511758e+09 5153036.02342116.0pv1.511759e+09 598929.02429887.0buy 1.511759e+09 100 rows × 3 columns

2、重复值
duplicated() 查找重复值
data.duplicated()

0False 1False 2False 3False 4False ... 95False 96False 97False 98False 99False Length: 100, dtype: bool

drop_duplicates() 删除重复值
data.drop_duplicates()

user_id item_id catagory_id behavior_typetimestamp 012333346.02520771.0pv1.511562e+09 112576651.0149192.0pv1.511573e+09 212576651.0149192.0pv1.511573e+09 314365585.02520377.0pv1.511596e+09 414606018.02735466.0NaN 1.511616e+09 ... ... ... ... ... ... 95100 4182583.01258177.0pv1.511751e+09 96100 2337874.01194311.0pv1.511751e+09 97100 3658601.02342116.0pv1.511758e+09 98100 5153036.02342116.0pv1.511759e+09 99100 598929.02429887.0buy 1.511759e+09 100 rows × 5 columns

3、空值、缺失值
data.isnull() # True为空,反之非空

user_id item_id catagory_id behavior_typetimestamp 0FalseFalseFalseFalseFalse 1FalseFalseFalseFalseFalse 2FalseFalseFalseFalseFalse 3FalseFalseFalseFalseFalse 4FalseFalseFalseTrueFalse ... ... ... ... ... ... 95FalseFalseFalseFalseFalse 96FalseFalseFalseFalseFalse 97FalseFalseFalseFalseFalse 98FalseFalseFalseFalseFalse 99FalseFalseFalseFalseFalse 100 rows × 5 columns

data.isnull().values==True # 可用data.isnull().values==True来定位 array([[False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False,True, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False,True], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, 
False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False,True, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False,True, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False,True], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False]])

data.notnull() # true为非空,反之为空

user_id item_id catagory_id behavior_typetimestamp 0TrueTrueTrueTrueTrue 1TrueTrueTrueTrueTrue 2TrueTrueTrueTrueTrue 3TrueTrueTrueTrueTrue 4TrueTrueTrueFalseTrue ... ... ... ... ... ... 95TrueTrueTrueTrueTrue 96TrueTrueTrueTrueTrue 97TrueTrueTrueTrueTrue 98TrueTrueTrueTrueTrue 99TrueTrueTrueTrueTrue 100 rows × 5 columns

data.fillna(0) # 用0填充空值

user_id item_id catagory_id behavior_typetimestamp 012333346.02520771.0pv1.511562e+09 112576651.0149192.0pv1.511573e+09 212576651.0149192.0pv1.511573e+09 314365585.02520377.0pv1.511596e+09 414606018.02735466.001.511616e+09 ... ... ... ... ... ... 95100 4182583.01258177.0pv1.511751e+09 96100 2337874.01194311.0pv1.511751e+09 97100 3658601.02342116.0pv1.511758e+09 98100 5153036.02342116.0pv1.511759e+09 99100 598929.02429887.0buy 1.511759e+09 100 rows × 5 columns

data.dropna(how='any', inplace=True) # 删除 含空值的行

4、字符处理
空格
data['behavior_type'].value_counts()

pv87 fav5 buy3 Name: behavior_type, dtype: int64

data.replace('p','pv')

user_id item_id catagory_id behavior_typetimestamp 012333346.02520771.0pv1.511562e+09 112576651.0149192.0pv1.511573e+09 212576651.0149192.0pv1.511573e+09 314365585.02520377.0pv1.511596e+09 51230380.0411153.0pv1.511645e+09 ... ... ... ... ... ... 95100 4182583.01258177.0pv1.511751e+09 96100 2337874.01194311.0pv1.511751e+09 97100 3658601.02342116.0pv1.511758e+09 98100 5153036.02342116.0pv1.511759e+09 99100 598929.02429887.0buy 1.511759e+09 95 rows × 5 columns

去除空格
data['behavior_type'].map(str.strip) # 去除两端空格

0pv 1pv 2pv 3pv 5pv ... 95pv 96pv 97pv 98pv 99buy Name: behavior_type, Length: 95, dtype: object

data['behavior_type'].map(str.lstrip) # 去除左端空格

0pv 1pv 2pv 3pv 5pv ... 95pv 96pv 97pv 98pv 99buy Name: behavior_type, Length: 95, dtype: object

data['behavior_type'].map(str.rstrip) # 去除右端空格

0pv 1pv 2pv 3pv 5pv ... 95pv 96pv 97pv 98pv 99buy Name: behavior_type, Length: 95, dtype: object

大小写
data['behavior_type'].map(str.upper) # 大写

0PV 1PV 2PV 3PV 5PV ... 95PV 96PV 97PV 98PV 99BUY Name: behavior_type, Length: 95, dtype: object

data['behavior_type'].map(str.lower) # 小写

0pv 1pv 2pv 3pv 5pv ... 95pv 96pv 97pv 98pv 99buy Name: behavior_type, Length: 95, dtype: object

data['behavior_type'].map(str.title) # 首字母大写

0Pv 1Pv 2Pv 3Pv 5Pv ... 95Pv 96Pv 97Pv 98Pv 99Buy Name: behavior_type, Length: 95, dtype: object

data['behavior_type'].value_counts()

pv87 fav5 buy3 Name: behavior_type, dtype: int64

# data.replace('Pv','PV') 替换

5、异常和极端值
发现方法: a、可通过describe()方法,观察统计的各项数据,发现异常值 b、借助散点图、箱型图、正态分布图等进行异常值的发现 处理办法 a、删除——先将异常值替换为na,然后用dropna()删除 b、视为缺失值——先将异常值替换为na,然后用缺失值处理方法处理(填充,插值等)

6、更改数据格式
data['timestamp'].dtype

dtype('float64')

data['timestamp'].astype('int32') # 将 'float64' 转化为 'int32'

01511561733 11511572885 21511572886 31511596146 51511644942 ... 951511750991 961511751022 971511757958 981511758581 991511759458 Name: timestamp, Length: 95, dtype: int32

data['date'] = pd.to_datetime(data['timestamp'])

data['date']

01970-01-01 00:00:01.511561733 11970-01-01 00:00:01.511572885 21970-01-01 00:00:01.511572886 31970-01-01 00:00:01.511596146 51970-01-01 00:00:01.511644942 ... 951970-01-01 00:00:01.511750991 961970-01-01 00:00:01.511751022 971970-01-01 00:00:01.511757958 981970-01-01 00:00:01.511758581 991970-01-01 00:00:01.511759458 Name: date, Length: 95, dtype: datetime64[ns]

7、分列
str.split(self,pat = None,n = -1,expand = False )

pat str,可选:要分割的字符串或正则表达式。如果未指定,请在空白处分割。 n int,默认为-1(全部):限制输出的分割数。 None,则0和-1将被解释为返回所有拆分。 expand,默认为False:将拆分的字符串展开为单独的列。如果为True,则返回DataFrame / MultiIndex扩展维; 如果为False,则返回包含字符串列表的Series / Index。

s14 = pd.Series(["this is a regular sentence","https://docs.python.org/3/tutorial/index.html",np.nan]) s14

0this is a regular sentence 1https://docs.python.org/3/tutorial/index.html 2NaN dtype: object

s14.str.split()

0[this, is, a, regular, sentence] 1[https://docs.python.org/3/tutorial/index.html] 2NaN dtype: object

s14.str.rsplit()

0[this, is, a, regular, sentence] 1[https://docs.python.org/3/tutorial/index.html] 2NaN dtype: object

s14.str.split(n=2)

0[this, is, a regular sentence] 1[https://docs.python.org/3/tutorial/index.html] 2NaN dtype: object

s14.str.rsplit(n=2)

0[this is a, regular, sentence] 1[https://docs.python.org/3/tutorial/index.html] 2NaN dtype: object

s14.str.rsplit(pat='/') 0[this is a regular sentence] 1[https:, , docs.python.org, 3, tutorial, index... 2NaN dtype: object

s14.str.split(pat='/',expand=True)

012345 0this is a regular sentenceNoneNoneNoneNoneNone 1https:docs.python.org 3tutorialindex.html 2NaN NaN NaN NaN NaN NaN

六、数据预处理
1、数据分组、分段 groupby() 分组
data.groupby(['behavior_type']).count() # 按照'behavior_type'对data进行分组并计数

user_id item_id catagory_id timestampdate behavior_type buy 33333 fav 55555 pv8787878787

data.groupby(['behavior_type','item_id']).sum() # 按照'behavior_type'和'item_id'对data进行分组并求和

user_id catagory_id timestamp behavior_typeitem_id buy 598929.0100 2429887.01.511759e+09 1603476.0100 2951233.01.511580e+09 2971043.0100 4869428.01.511618e+09 fav 2158340.0100 2429887.01.511743e+09 2971043.0100 4869428.01.511565e+09 ... ... ... ... ... pv4954999.031233459.04.536499e+09 4973305.012520771.01.511969e+09 5002615.025040754.03.023684e+09 5100093.0100 2945933.01.511552e+09 5153036.0100 2342116.01.511759e+09 74 rows × 3 columns

cut()/qcut() 分段
score_list = np.random.randint(25, 100, size=20) # 分数列表 print(score_list)

[38 62 72 26 60 50 61 86 38 97 72 47 61 59 83 73 48 56 59 38]

bins = [0, 59, 70, 80, 100] # 指定分数区间 print(bins)

[0, 59, 70, 80, 100]

score_cut = pd.cut(score_list, bins) # 对分数按照分数区间进行分段 print(score_cut)

[(0, 59], (59, 70], (70, 80], (0, 59], (59, 70], ..., (70, 80], (0, 59], (0, 59], (0, 59], (0, 59]] Length: 20 Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]

pd.value_counts(score_cut) # 查看各分数区间的人数

(0, 59]10 (59, 70]4 (80, 100]3 (70, 80]3 dtype: int64

2、 筛选、排序 sort_values() 按列进行排序
data.sort_values('user_id',ascending=True) # 按照'user_id'对data进行升序排序,ascending为False则为降序

user_id item_id catagory_id behavior_typetimestampdate 012333346.02520771.0pv1.511562e+091970-01-01 00:00:01.511561733 3013157558.02520771.0pv1.511985e+091970-01-01 00:00:01.511984943 3112087357.02131531.0pv1.512005e+091970-01-01 00:00:01.512004568 3214170517.0149192.0pv1.512011e+091970-01-01 00:00:01.512011019 3311340922.04690421.0pv1.512041e+091970-01-01 00:00:01.512041260 ... ... ... ... ... ... ... 72100 2971043.04869428.0buy 1.511618e+091970-01-01 00:00:01.511617549 73100 2337874.01194311.0pv1.511680e+091970-01-01 00:00:01.511680485 74100 4182583.01258177.0pv1.511681e+091970-01-01 00:00:01.511680521 76100 4919803.02881542.0pv1.511684e+091970-01-01 00:00:01.511684032 99100 598929.02429887.0buy 1.511759e+091970-01-01 00:00:01.511759458 95 rows × 6 columns

data.sort_values(['user_id','item_id'],ascending=[True,False])

user_id item_id catagory_id behavior_typetimestampdate 1415002615.02520377.0pv1.511839e+091970-01-01 00:00:01.511839385 1615002615.02520377.0pv1.511844e+091970-01-01 00:00:01.511844273 2814973305.02520771.0pv1.511969e+091970-01-01 00:00:01.511969365 3714954999.0411153.0pv1.512061e+091970-01-01 00:00:01.512061318 4214954999.0411153.0pv1.512173e+091970-01-01 00:00:01.512172732 ... ... ... ... ... ... ... 70100 1603476.02951233.0buy 1.511580e+091970-01-01 00:00:01.511579908 58100 1220136.04869428.0pv1.511551e+091970-01-01 00:00:01.511550908 65100 704268.0223690.0pv1.511564e+091970-01-01 00:00:01.511563606 92100 598929.02429887.0pv1.511743e+091970-01-01 00:00:01.511742899 99100 598929.02429887.0buy 1.511759e+091970-01-01 00:00:01.511759458 95 rows × 6 columns

sort_index() 按行进行排序 3、 连接和组合 append()
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3'], 'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}, index=[0, 1, 2, 3]) df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'], 'B': ['B4', 'B5', 'B6', 'B7'], 'C': ['C4', 'C5', 'C6', 'C7'], 'D': ['D4', 'D5', 'D6', 'D7']}, index=[4, 5, 6, 7]) df1.append(df2) # 在数据框df2的末尾添加数据框df1,其中df1和df2的列数应该相等

ABCD 0A0B0C0D0 1A1B1C1D1 2A2B2C2D2 3A3B3C3D3 4A4B4C4D4 5A5B5C5D5 6A6B6C6D6 7A7B7C7D7

concat()
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3'], 'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}, index=[0, 1, 2, 3]) df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'], 'B': ['B4', 'B5', 'B6', 'B7'], 'C': ['C4', 'C5', 'C6', 'C7'], 'D': ['D4', 'D5', 'D6', 'D7']}, index=[4, 5, 6, 7]) pd.concat([df1,df2],axis=1) # 在数据框df1的列最后添加数据框df2,其中df1和df2的行数应该相等

ABCDABCD 0A0B0C0D0NaN NaN NaN NaN 1A1B1C1D1NaN NaN NaN NaN 2A2B2C2D2NaN NaN NaN NaN 3A3B3C3D3NaN NaN NaN NaN 4NaN NaN NaN NaN A4B4C4D4 5NaN NaN NaN NaN A5B5C5D5 6NaN NaN NaN NaN A6B6C6D6 7NaN NaN NaN NaN A7B7C7D7

join()
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3'], 'key': ['K0', 'K1', 'K0', 'K1']})
df2 = pd.DataFrame({'C': ['C0', 'C1'], 'D': ['D0', 'D1']}, index=['K0', 'K1'])
df1.join(df2, on='key') # 按df1的'key'列与df2的索引进行左连接

ABkey CD 0A0B0K0C0D0 1A1B1K1C1D1 2A2B2K0C0D0 3A3B3K1C1D1

merge()
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3']}) right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}) result = pd.merge(left, right, on='key') print(result)

keyABCD 0K0A0B0C0D0 1K1A1B1C1D1 2K2A2B2C2D2 3K3A3B3C3D3

七、数据索引、提取
df[col] 以数组 Series 的形式返回选取的列

df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE')) df['C']

00.340454 10.116156 20.234191 30.516992 40.354863 Name: C, dtype: float64

df[[col1,col2]] 以新的数据框(DataFrame)的形式返回选取的列

df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE')) df[['B','E']]

BE 00.3136190.848954 10.5718870.337651 20.6668880.086449 30.4236150.368037 40.4539950.479666

df.loc['index_one'] 按照索引选取

s = pd.Series(np.array(['I','Love','Data'])) s.loc[1]

'Love'

df.iloc[0] 按照位置选取

df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE')) df.iloc[0,:] # 选取第一行

A0.960768 B0.433873 C0.777235 D0.799733 E0.643273 Name: 0, dtype: float64

df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE')) df.iloc[0,0] # 选取第一行第一个元素

0.7353611931210813

时间序列 八、数据统计
描述统计
data.describe()

user_id item_id catagory_id timestamp count95.0000009.500000e+019.500000e+019.500000e+01 mean45.8105262.831879e+062.494733e+061.511826e+09 std 49.5387871.398492e+061.421917e+062.213639e+05 min 1.0000004.625900e+041.491920e+051.511551e+09 25% 1.0000002.087357e+061.289235e+061.511682e+09 50% 1.0000002.791761e+062.465336e+061.511751e+09 75% 100.0000003.875118e+063.213828e+061.511980e+09 max 100.0000005.153036e+064.869428e+061.512282e+09

data.groupby(['behavior_type']).count().mean()

user_id31.666667 item_id31.666667 catagory_id31.666667 timestamp31.666667 date31.666667 dtype: float64

更多见数据查看 函数应用
apply:应用在DataFrame的行或列中 applymap:应用在DataFrame的每个元素中 map:应用在单独一列(Series)的每个元素中

九、数据透视表
pd.pivot_table()

    推荐阅读