2019数据智能算法大赛个人baseline 2019数据智能算法大赛个人bas

2019数据智能算法大赛个人baseline

在google colaboratory上使用
导入相应的包
数据的读入

数据探索性分析 pandas_profiling

特征工程

数据预处理

查看缺失值，并且缺失的个数要从低到高排序
缺失值处理
将字符类型转换为数值类型

特征提取

建模训练预测

【2019数据智能算法大赛个人baseline】
在google colaboratory上使用

from google.colab import drive drive.mount('/content/drive')

导入相应的包

import numpy as np import pandas as pd import matplotlib.pyplot as pltfrom datetime import timedelta import pandas_profiling as ppf import datetimefrom sklearn.model_selection import train_test_split import lightgbm as lgbimport warnings warnings.filterwarnings('ignore') %matplotlib inline

数据的读入

full = pd.read_csv('/content/drive/My Drive/Colab Notebooks/DC/round1_diac2019_train.csv') full.head()

# 先输入数据的基本信息。 # 这是一个好习惯，可以对数据的规模、各个特征的数据类型以及是否有缺失等，有一个总体的了解。 print(full.info())

数据探索性分析 pandas_profiling

ppf.ProfileReport(full)

特征工程数据预处理查看缺失值，并且缺失的个数要从低到高排序

# 统计出空值的个数 miss = full.isnull().sum() # 由低到高进行排序 miss[miss>0].sort_values(ascending=True)

缺失值处理

# 记录待预测的所有用户id # drop_duplicates去除重复项 dropna()滤除缺失数据 all_customer = pd.DataFrame(full[['customer_id']]).drop_duplicates(['customer_id']).dropna()# order_total_amount = order_total_payment + order_total_discount full["order_total_discount"] = full["order_amount"] - full["order_total_payment"] full_neg_order_total_discount = full.loc[full.order_total_discount <0, 'order_total_discount'] full.loc[full.order_total_discount <0, 'order_total_discount'] = 0 - full_neg_order_total_discount full.loc[full.order_amount




 
# 对字符类型的缺失值进行填充
cols = ["customer_province" , "customer_city"]
for col in cols:
full[col].fillna("None",inplace=True)# 对数值类型的缺失值进行填充
cols1=["is_customer_rate", "customer_gender", "is_member_actived"]
for col in cols1:
full[col].fillna(0, inplace=True)# 对这些列用众数进行填充
cols2 = ["customer_id","member_status"]
for col in cols2:
full[col].fillna(full[col].mode()[0], inplace=True)



 
将字符类型转换为数值类型


 
# 针对customer_province处理
customer_province_cold = ['香港特别行政区','台湾','柔佛']
customer_province_hot1 = ['广东省','浙江省','江苏省','上海','北京','None']
customer_province_hot2 = ['四川省','湖北省','山东省','河南省','福建省','安徽省','湖南省']
customer_province_hot3 = ['辽宁省','江西省','重庆','河北省','陕西省','云南省']
customer_province_hot4 = ['广西壮族自治区','天津','黑龙江省','贵州省','吉林省',
'山西省','内蒙古自治区','新疆维吾尔自治区','甘肃省',
'海南省','西藏自治区','宁夏回族自治区','青海省']for province_col_cold in customer_province_cold:
full.loc[full.customer_province == province_col_cold, 'customer_province'] = 0for province_col_hot1 in customer_province_hot1:
full.loc[full.customer_province == province_col_hot1, 'customer_province'] = 1for province_col_hot2 in customer_province_hot2:
full.loc[full.customer_province == province_col_hot2, 'customer_province'] = 2for province_col_hot3 in customer_province_hot3:
full.loc[full.customer_province == province_col_hot3, 'customer_province'] = 3for province_col_hot4 in customer_province_hot4:
full.loc[full.customer_province == province_col_hot4, 'customer_province'] = 4



 
# 针对customer_city处理
full_customer_city = full['customer_city'].drop_duplicates().dropna().values.tolist()
customer_city_hot1 = ['上海市','None','北京市','广州市','杭州市','成都市','武汉市','深圳市']
customer_city_hot2 = ['南京市','重庆市','苏州市','天津市','长沙市','郑州市','西安市']
customer_city_hot3 = ['合肥市','宁波市','福州市','昆明市','温州市','东莞市','青岛市',
'沈阳市','厦门市','佛山市','南昌市','无锡市','济南市','哈尔滨市']
customer_city_cold = list(set(full_customer_city) - set(customer_city_hot1) - set(customer_city_hot2) - set(customer_city_hot3))for city_col_hot1 in customer_city_hot1:
full.loc[full.customer_city == city_col_hot1, 'customer_city'] = 1for city_col_hot2 in customer_city_hot2:
full.loc[full.customer_city == city_col_hot2, 'customer_city'] = 2for city_col_hot3 in customer_city_hot3:
full.loc[full.customer_city == city_col_hot3, 'customer_city'] = 3for city_col_cold in customer_city_cold:
full.loc[full.customer_city == city_col_cold, 'customer_city'] = 0



 
from sklearn.preprocessing import LabelEncoder#标签编码
lab = LabelEncoder()#full["customer_province"] = lab.fit_transform(full.customer_province).astype('int64')
#full["customer_city"] = lab.fit_transform(full.customer_city).astype('int64')# 浮点转为整型
full["customer_city"] = full["customer_city"].astype('int64')
full["customer_gender"] = full["customer_gender"].astype('int64')
full["member_status"] = full["member_status"].astype('int64')
full["is_member_actived"] = full["is_member_actived"].astype('int64')
full["is_customer_rate"] = full["is_customer_rate"].astype('int64')
full["customer_id"] = full["customer_id"].astype('int64')# 利用ID
full.loc[full.member_id !=0, 'member_id'] = 1



 
特征提取 
# 查看order_pay_time的范围
print('full date gap',full.order_pay_time.min(),'->',full.order_pay_time.max()) 



 
full['order_pay_time'] = pd.to_datetime(full['order_pay_time'])# 转换日期格式 例：2013-01-01 14:25:30
full['order_pay_date'] = full['order_pay_time'].dt.date# 只取日期时间date部分 例：2013-01-01
# 取验证集开始的时间，即2013-07-04
validata_date_begin = full['order_pay_date'].max() - datetime.timedelta(days=180) # timedelta模块 时间增减用法# full前180天 order_pay_date 的数据
train_history = full[(full['order_pay_date'].astype(str)<='2013-07-03')]# astype() 实现变量类型转换
# full360天 order_pay_date 的数据
online_history = full[(full['order_pay_date'].astype(str)<='2013-12-31')]
# train_label 相对于 train_history 的未来180天的数据
train_label = full[full['order_pay_date'].astype(str)>='2013-07-04']



 
# 使数据有折扣和值为19.99，9.90，19.90，0.01—>order_total_discount = 1
train_history.loc[train_history.order_total_discount>0, 'order_total_discount'] = 1
online_history.loc[online_history.order_total_discount>0, 'order_total_discount'] = 1order_amount_nums = [19.99,9.90,19.90,0.01]
for order_amount_num in order_amount_nums:
train_history.loc[train_history.order_amount==order_amount_num, 'order_total_discount'] = 1
online_history.loc[online_history.order_amount==order_amount_num, 'order_total_discount'] = 1



 
# make label
train_history['label'] = 0
train_history.loc[train_history['customer_id'].isin(list(train_label['customer_id'].unique())),'label'] = 1# isin()接受一个列表，判断该列中元素是否在列表中。
train_history['label'].value_counts()



 
# 重新制作训练集和测试集的特征
def make_new_feature1(feature,isSubmit):
feature['count'] = 1
# 统计单个用户出现了多少次
customer_id_count = feature.groupby(['customer_id'],as_index=False)['count'].agg({'count':'count'})
# 统计单个用户第一次和最后一次购买订单时间
last_time = feature.groupby(['customer_id'],as_index=False)['order_pay_date'].agg({'order_pay_date_first':'min',
'order_pay_date_last':'max'})
# 合并以上数据
print('start to merge!')data = https://www.it610.com/article/pd.merge(customer_id_count,last_time,on=['customer_id'],how='left',copy=False)
# 记录购买间隔时间
data['pay_interval_time'] = pd.to_datetime(data['order_pay_date_last']) - pd.to_datetime(data['order_pay_date_first'])
data['pay_interval_time'] = data['pay_interval_time'].dt.days + 1
# 购买距离双十一时间
data['pay_11interval_time'] = pd.to_datetime('2013-11-11') - pd.to_datetime(data['order_pay_date_last'])
data['pay_11interval_time'] = data['pay_11interval_time'].dt.days + 1
del data['order_pay_date_first']
if isSubmit==False:
data['order_pay_date_last'] = pd.to_datetime('2013-07-03') - pd.to_datetime(data['order_pay_date_last'])
data['order_pay_date_last'] = data['order_pay_date_last'].dt.days + 1
else:
data['order_pay_date_last'] = pd.to_datetime('2013-12-31') - pd.to_datetime(data['order_pay_date_last'])
data['order_pay_date_last'] = data['order_pay_date_last'].dt.days + 1print(data.shape)
return data



 
# 生成训练数据和提交数据
train1 = make_new_feature1(train_history,False)
test1 = make_new_feature1(online_history,True)



 
# 超过双十一都改为距离下一年双十一的时间
pay_11interval_time_neg = test1.loc[test1.pay_11interval_time <0, 'pay_11interval_time']
test1.loc[test1.pay_11interval_time <0, 'pay_11interval_time'] = 365 + pay_11interval_time_neg



 
# 添加新的特征
def make_new_feature2(feature, train1):
# 性别
customer_gender = feature.groupby(['customer_id'],as_index=False)['customer_gender'].agg({'customer_gender':'max'}) # 用户所在省市
customer_province = feature.groupby(['customer_id'],as_index=False)["customer_province"].agg({'customer_province':'max'})
customer_city = feature.groupby(['customer_id'],as_index=False)["customer_city"].agg({'customer_city':'median'})# 支持会员折扣
goods_has_discount = feature.groupby(['customer_id'],as_index=False)["goods_has_discount"].agg({'goods_has_discount_count':'count'})
#'goods_has_discount_sum':'sum'})
#'goods_has_discount_min':'min'})# 用户评价的次数以及用户评价率
def evaluate_rate_computer(group):
return (group.sum()) / (group.count())
evaluate_count = feature.groupby(['customer_id'],as_index=False)["is_customer_rate"].agg({'evaluate_count':'sum',
'evaluate_rate':evaluate_rate_computer})# 用户是否是会员
is_member = feature.groupby(['customer_id'],as_index=False)["member_status"].agg({'is_member':'mean'})
#is_member = feature.groupby(['customer_id'],as_index=False)["member_id"].agg({'is_member':'max'})# 合并以上数据
print('start to merge!')data = https://www.it610.com/article/pd.merge(train1,customer_gender,on=['customer_id'],how='left',copy=False)
data = https://www.it610.com/article/pd.merge(data,customer_province,on=['customer_id'],how='left',copy=False)
data = https://www.it610.com/article/pd.merge(data,customer_city,on=['customer_id'],how='left',copy=False)
data = https://www.it610.com/article/pd.merge(data,goods_has_discount,on=['customer_id'],how='left',copy=False)
data = https://www.it610.com/article/pd.merge(data,evaluate_count,on=['customer_id'],how='left',copy=False)
data = https://www.it610.com/article/pd.merge(data,is_member,on=['customer_id'],how='left',copy=False)print(data.shape)
return data



 
# 生成训练数据和提交数据
train2 = make_new_feature2(train_history, train1)
test2 = make_new_feature2(online_history, test1)



 
def make_new_feature3(feature, train2):
# 统计这个用户购买商品的价格信息
good_price = feature.groupby(['customer_id'],as_index=False)['goods_price'].agg({'goods_price_max':'max',
'goods_price_min':'min',
'goods_price_mean':'mean'})
## 订单应付总金额
#order_amount = feature.groupby(['customer_id'],as_index=False)['order_amount'].agg({'order_amount_max':'max',
#'order_amount_min':'min',
#'order_amount_sum':'sum'})
# 订单优惠金额
order_total_discount = feature.groupby(['customer_id'],as_index=False)['order_total_discount'].agg({'order_total_discount_sum':'sum'})
##'order_total_discount_min':'min',
##'order_total_discount_max':'max'})# 合并以上数据
print('start to merge!')data = https://www.it610.com/article/pd.merge(train2,good_price,on=['customer_id'],how='left',copy=False)
#data = https://www.it610.com/article/pd.merge(data,order_amount,on=['customer_id'],how='left',copy=False)
data = https://www.it610.com/article/pd.merge(data,order_total_discount,on=['customer_id'],how='left',copy=False)print(data.shape)
return data



 
# 生成训练数据和提交数据
train3 = make_new_feature3(train_history, train2)
test3 = make_new_feature3(online_history, test2)



 
def make_label(labels,train3):
label = labels.groupby(['customer_id'],as_index=False)['label'].agg({'label':'max'})data = https://www.it610.com/article/pd.merge(train3,label,on=['customer_id'],how='left',copy=False)
print(data.shape)
return data



 
train = make_label(train_history,train3)
test = test3.copy()
print(test.shape)



 
train["customer_city"] = train["customer_city"].astype('int64')
test["customer_city"] = test["customer_city"].astype('int64')
train["order_total_discount_sum"] = train["order_total_discount_sum"].astype('int64')
test["order_total_discount_sum"] = test["order_total_discount_sum"].astype('int64')



 
建模训练预测 
# 构建机器学习所需的label和data
train_copy = train.copy()
y = train_copy.pop('label')
feature = [x for x in train_copy.columns if x not in ['customer_id']]
X = train_copy[feature]# 训练与测试数据标准化处理
from sklearn.preprocessing import StandardScaler
ss_X = StandardScaler()
X_scaled = ss_X.fit_transform(X)# 划分训练集和验证集
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.5, random_state=33)



 
# 参数
param = {
'num_leaves':40,
'objective':'binary',
'max_depth':5,
'learning_rate':0.1,
'metric':'binary_logloss'}test_df = test[['customer_id']]
X_test = test[feature]
#
train_data = https://www.it610.com/article/lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)
lgbm = lgb.train(param,train_data,valid_sets=[train_data,valid_data],
num_boost_round = 10000 ,
early_stopping_rounds=250,
verbose_eval=50)
y_test = lgbm.predict(X_test)



 
print(pd.DataFrame({
'column':feature,
'importance':lgbm.feature_importance()
}).sort_values(by='importance'))



 
test_df['result'] = y_testall_customer_submit = pd.merge(all_customer,test_df,on=['customer_id'],how='left',copy=False)
all_customer_submit = all_customer_submit.sort_values(['customer_id'])
all_customer_submit['customer_id'] = all_customer_submit['customer_id'].astype('int64')
all_customer_submit['result'] = all_customer_submit['result'].fillna(0)
all_customer_submit.to_csv('./DC_2019results.csv',index=False)




		  	

    
    




    
    
    


推荐阅读

           
                  
              
                  四个字的微信网名 四个字的微信网名女 
                
                   
                
              
            

                  
              
                  还不懂Spring|还不懂Spring Boot启动原理的话，一定要看看！ 
                
                   
                
              
            

                  
              
                  不可理喻的爱情 
                
                   
                
              
            

                  
              
                  印度_社会|印度单日新增死亡超4000例！印度疫情亲历者：只有少数人戴口罩 
                
                   
                
              
            

                  
              
                  CAD调出菜单栏 CAD调出菜单栏的操作教程 
                
                   
                
              
            

                  
              
                  索尼HDR摄像机黑屏的故障原因 
                
                   
                
              
            

                  
              
                  鸡蛋放冰箱里好吗？影响鸡蛋新鲜度和食用安全的因素 
                
                   
                
              
            

                  
              
                  佳能废墨清零软件下载教程-请问佳能IP1980打印机清零软件那里有下载 
                
                   
                
              
            

                  
              
                  vivoy5s忘记锁屏密码怎么办 
                
                   
                
              
            

                  
              
                  纸短情长 歌词 纸短情长歌词完整版 
                
                   
                
              
            

                  
              
                  灌木的特点图片及简介 什么是灌木 
                
                   
                
              
            

                  
              
                  在哪关闭每月自动续费 
                
                   
                
              
            

                  
              
                  python 关联分析 svm,灰色关联度Python 
                
                   
                
              
            

                  
              
                  相机镜头小三元 佳能尼康小三元镜头 
                
                   
                
              
            

                  
              
                  为什么只有怀孕的北极熊能冬眠？ 北极为什么熊冬眠 
                
                   
                
              
            

                  
              
                  1946年属什么生肖属相 1946年属狗吗 
                
                   
                
              
            

                  
              
                  科举制度是从什么时候开始的 
                
                   
                
              
            

                  
              
                  如何在手机上删除服务器？ 手机怎么删除服务器 
                
                   
                
              
            

                  
              
                  软件编程月薪多少万，软件开发工资多少 
                
                   
                
              
            

                  
              
                  《上古卷轴5：天际》重制版不死生物打法介绍 不死生物都有哪些物种 
                
                   
                
              
            

          

Docker应用:容器间通信与Mariadb数据库主从复制 
 2019-02-13——今天谈梦想（） 
 20190302|20190302 复盘翻盘 
 2019年12月24日 
 2019.4.18感恩日记 
 2019.4.2咖啡冥想日记 
 2019-1-14 
 亲子日记（287）2019.4.27. 
 考前焦虑——接纳情绪，转移注意力 
 2019-01-17-晨读7期-直子Day25