Financial Anti-Fraud Model

Looking back over the whole project, the deepest takeaway was how to pick the most useful feature variables out of a large pool of candidates.

Data Processing

The code below walks through the process step by step.

import pandas as pd
import numpy as np

# Read the data. skiprows=1 skips the first line of the file so that reading
# starts from the second line, where the real header row begins.
data = pd.read_csv(r"G:\data\LoanStats_2016Q2\LoanStats_2016Q2.csv", skiprows=1, low_memory=True)
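As a quick sanity check (my addition, not part of the original walkthrough), the shape and column types of the loaded frame can be inspected right away:

# Illustrative sanity check on the loaded frame
print(data.shape)                   # (number of rows, number of columns)
print(data.dtypes.value_counts())   # how many columns of each dtype
print(data.head(3))                 # first few records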

# Drop columns where most of the data is missing
data.drop("id", axis=1, inplace=True)
data.drop("member_id", axis=1, inplace=True)

# Keep only the numeric part of each value
data.term.replace(to_replace='[^0-9]+', value="", inplace=True, regex=True)
data.int_rate.replace("%", value="", inplace=True, regex=True)

# Drop text columns with too many distinct categories; they would generate a huge
# number of columns during dummy encoding later, so remove them up front
data.drop("sub_grade", axis=1, inplace=True)
data.drop("emp_title", axis=1, inplace=True)

# Process employment length: replace "n/a" with np.nan, then keep the numeric part
data.emp_length.replace("n/a", np.nan, inplace=True)
data.emp_length.replace(to_replace='[^0-9]+', value="", inplace=True, regex=True)
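Note that after the regex replacement, term, int_rate, and emp_length still hold strings. The original script leaves them as-is, but a conversion along these lines (an assumed follow-up step, not shown in the source) would turn them into numbers so they are not dummy-encoded as text later:

# Assumed follow-up: cast the stripped string columns to numeric values
for col in ["term", "int_rate", "emp_length"]:
    data[col] = pd.to_numeric(data[col], errors="coerce")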

# Drop columns that are entirely empty. how="all" matches only when every cell is
# missing; how="any" matches if any single cell is missing.
# axis=1 works column by column, axis=0 works row by row.
data.dropna(axis=1, how="all", inplace=True)
# Drop rows that are entirely empty
data.dropna(axis=0, how="all", inplace=True)

# Print non-null counts for every column
print(data.info(verbose=True, null_counts=True))

# Drop the following columns in one batch; most of their values are empty
data.drop(["hardship_type", "hardship_reason", "hardship_status", "deferral_term",
           "hardship_amount", "hardship_start_date", "hardship_end_date",
           "payment_plan_start_date", "hardship_length", "hardship_dpd",
           "hardship_loan_status", "orig_projected_additional_accrued_interest",
           "hardship_payoff_balance_amount", "hardship_last_payment_amount",
           "debt_settlement_flag_date", "settlement_status", "settlement_date",
           "settlement_amount", "settlement_percentage", "settlement_term"],
          axis=1, inplace=True)
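The column list above comes from reading the info() output by hand. As an alternative, a short sketch like the following (my own illustration; the 0.9 cutoff is an arbitrary assumption) can flag mostly-empty columns programmatically:

# Illustrative alternative: find columns whose missing-value ratio exceeds a threshold
missing_ratio = data.isnull().mean()          # fraction of missing cells per column
mostly_empty = missing_ratio[missing_ratio > 0.9].index.tolist()
print(mostly_empty)
# data.drop(mostly_empty, axis=1, inplace=True)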

# Compute the correlation between every pair of numeric columns
cor = data.corr()
# Keep only the lower triangle of the matrix so each pair appears once
cor.iloc[:, :] = np.tril(cor, k=-1)
cor = cor.stack()  # stack the matrix into one long column of (row, column) pairs
# print(cor[cor > 0.95])  # list the pairs with correlation above 0.95
# For each highly correlated pair, drop one of the two columns
data.drop(["funded_amnt", "funded_amnt_inv", "out_prncp_inv", "total_pymnt_inv",
           "total_rec_prncp", "collection_recovery_fee", "num_rev_tl_bal_gt_0",
           "num_sats", "tot_hi_cred_lim", "total_il_high_credit_limit"],
          axis=1, inplace=True)
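The columns dropped above were picked by hand from the pairs with correlation above 0.95. For reference, here is a sketch of how that list could be pulled out of the stacked series automatically (the 0.95 threshold comes from the commented-out print; the rest is my own illustration):

# Illustrative way to list the highly correlated pairs behind the manual drop
high_cor = cor[cor > 0.95]                     # (column_a, column_b) -> correlation
print(high_cor)
cols_to_drop = {b for a, b in high_cor.index}  # keep one side of each correlated pair
print(cols_to_drop)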

# For object-typed columns, print the number of distinct values; columns with
# too few or too many categories can both be dropped
for col in data.select_dtypes(include=["object"]).columns:
    # print(len(data[col].unique()))
    print("col {} has {}".format(col, len(data[col].unique())))

# Drop the columns whose category count is too small or too large
data.drop(["grade", "home_ownership", "verification_status", "issue_d", "pymnt_plan",
           "desc", "zip_code", "initial_list_status", "next_pymnt_d", "application_type",
           "verification_status_joint", "hardship_flag", "disbursement_method",
           "debt_settlement_flag", "earliest_cr_line", "revol_util"],
          axis=1, inplace=True)
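The same rule of thumb can be expressed in code. A sketch (the cutoffs of 1 and 50 distinct values are my own assumptions, chosen only for illustration) that flags object columns whose category count looks too small or too large:

# Illustrative filter for object columns with too few or too many categories
suspect_cols = [col for col in data.select_dtypes(include=["object"]).columns
                if data[col].nunique() <= 1 or data[col].nunique() > 50]
print(suspect_cols)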

# Process the target variable. For now this is treated as a binary classification
# problem, so only Fully Paid and Charged Off are kept.
data.loan_status.replace("Fully Paid", value=int(1), inplace=True)
data.loan_status.replace("Charged Off", value=int(0), inplace=True)
data.loan_status.replace("Current", value=np.nan, inplace=True)
data.loan_status.replace("Late (31-120 days)", value=np.nan, inplace=True)
data.loan_status.replace("In Grace Period", value=np.nan, inplace=True)
data.loan_status.replace("Late (16-30 days)", value=np.nan, inplace=True)
data.loan_status.replace("Default", value=np.nan, inplace=True)
data.dropna(subset=["loan_status"], axis=0, how="any", inplace=True)
# Fill the remaining np.nan values with 0.0
data.fillna(0.0, inplace=True)
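Before training, it is worth checking how imbalanced the binarized target is, since charged-off loans are typically a small minority; a quick check (my addition) looks like this:

# Check the balance between Fully Paid (1) and Charged Off (0)
print(data.loan_status.value_counts())
print(data.loan_status.value_counts(normalize=True))  # class proportions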

# Dummy-encode the remaining categorical columns
data = pd.get_dummies(data)
data.to_csv(r"G:\data\LoanStats_2016Q2\LoanStats_2016Q2_3.csv")

Model Building and Prediction

Next, logistic regression is used to build the anti-fraud model.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

path = r"G:\data\LoanStats_2016Q2\LoanStats_2016Q2_3.csv"
data = pd.read_csv(path)
Y = data.loan_status
X = data.drop("loan_status", axis=1, inplace=False)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
lr = LogisticRegression()
lr.fit(x_train, y_train)
test_predict = lr.predict(x_test)
print(metrics.accuracy_score(y_test, test_predict))
print(metrics.recall_score(y_test, test_predict))
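Because the two classes are imbalanced, accuracy and recall alone give a limited picture of the model. A hedged extension (my own addition, not part of the original script) that also reports a confusion matrix, a per-class summary, and the ROC AUC:

# Additional evaluation metrics, as an illustrative extension
print(metrics.confusion_matrix(y_test, test_predict))
print(metrics.classification_report(y_test, test_predict))
# ROC AUC is computed from the predicted probability of the positive class (label 1)
test_proba = lr.predict_proba(x_test)[:, 1]
print(metrics.roc_auc_score(y_test, test_proba))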
