基于邻域粗糙集算法python实现

还在跑程序,我这渣笔记本CPU估计要跑很久,不如来贴个代码吧。
这个是之前一篇粗糙集相对属性约简python实现的一个小小改进
当中距离函数使用的是欧几里得距离公式,入坑没多久,代码写的不是很好,请见谅。

# _*_coding:utf-8 _*_ #@Time:2018/12/22 上午10:45 #@Author:we2swing #@FileName: test.pyimport pandas as pd import numpy as np import math import matplotlib.pyplot as pltdef basic_set(df): basic = {} for i in df.drop_duplicates().values.tolist(): basic[str(i)] = [] for j, k in enumerate(df.values.tolist()): if k == i: basic[str(i)].append(j)return basicdef Euclidean(x_data): n = x_data.shape[1]#代表列数 m = x_data.shape[0]#代表行数 num = [0] * n num2 = np.zeros((m,m)) #转为list arr = np.array(x_data)counti,countj,countk,countz = 0,0,0,0 #遍历每一行的元素 for j in arr: countk = 0 for k in arr: if countj != countk: countz = 0 # 遍历每一行的每一个元素 for z in k: temp = np.square(z - j[countz]) if countz == 0: num[countz] = temp else: #把每一行中每个元素的差值平方相加 num[countz] = num[countz-1] + temp countz = countz + 1 else: num[countz-1] = 0 num2[countj][countk] = round(math.sqrt(num[countz-1]),1) countk = countk + 1 countj = countj + 1 return num2def key_basic(num,k): # k = 4# 设δ=k # k = [4.0,4.1,4.2,4.3,4.4,4.5,4.6,4.7,4.8,4.9,5.0] num1 = {} counti, countj = 0, 0 for i in num: num1[counti] = [] countj = 0 for j in i: if j <= k: num1[counti].append(countj) countj = countj + 1 counti = counti + 1 return num1def compare(data): # k = [4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0] global t1 t1 = 4.0 data = https://www.it610.com/article/data.dropna(axis=0, how='any') x_data = https://www.it610.com/article/data.drop(['judge'], axis=1) y_data = https://www.it610.com/article/data.loc[:,'judge'] n = x_data.shape[1]# 代表列数 m = x_data.shape[0]# 代表行数# 决策属性基本集 y_basic_set = sorted([v for k, v in basic_set(y_data).items()]) num = Euclidean(x_data) # print(num) x_basic_set = [v for k, v in key_basic(num,t1).items()]# γC(D) pos = [] for i in x_basic_set: for j in y_basic_set: if set(i).issubset(j): pos.append(i) # pos.sort()#排序 r_x_y = len(pos) / len(data)# 也许可以写一个card函数 print('依赖度r_x_(y):', r_x_y)# 计算每一个γ(C-a)(D) # 获取条件属性个数为总下标存入集合 # 收集属性重要性 imp_attr = [] columns_num = list(range(len(x_data.columns))) for i in columns_num: c = columns_num.copy() c.remove(i) u = data.iloc[:, c] num = Euclidean(u) u = sorted([v for k, v in key_basic(num,t1).items()])# γC(D) pos_va = [] for k in u: for j in y_basic_set: if set(k).issubset(j): pos_va.append(k) r_x_y_2 = len(pos_va) / len(data) r_diff = round((r_x_y - r_x_y_2), 4) imp_attr.append(r_diff)dict_imp = {} for o, p in enumerate(imp_attr): dict_imp[data.columns[o]] = presult = dict_imp # print(imp_attr) return resultdef sepdata(data): # 获取数据长度 len = data.iloc[:, 0].size # 将数据划分 if len % 100 != 0: if len > 100: num = len // 100 + 1 else: num = 1 else: if len > 100: num = int(len / 100) else: num = 1 arr = [[]] * numcount = 0 for i in arr: # 如果数少于100或者最后一部分数少于100,则放入一个由数长决定的数组 if num == 1: arr[count] = data.iloc[0:len]# 取100开始,取 elif count == num - 1: arr[count] = data.iloc[100 * count:len] else: arr[count] = data.iloc[100 * count:(count + 1) * 100] count = count + 1 sorted_dict_imp = [[]] * num total = [0] * 27 title = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27'] # total = [0] * 16 # title = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16'] print(sorted_dict_imp) count = 0 for i in arr: print('-------------------------------------第%d个数据集数据-----------------------------------------' % (count + 1)) sorted_dict_imp[count] = compare(i) count = count + 1 count1 = 0 # 将dict的key为C1-Cn的value存入total中保存,并且相加 for i in sorted_dict_imp: count = 0 if count1 == 0: for j in title: total[count] = i.get(j) count = count + 1 else: for z in title: total[count] = i.get(z) + total[count] count = count + 1 count1 = count1 + 1 # 输出最终C1-Cn的结果 count = 0 for i in title: print(i, ':', round(total[count], 4)) count = count + 1 figure(total)def figure(opt): count = 0 plt.rcParams['font.sans-serif'] = ['SimHei']# 用来正常显示中文标签 title = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27'] plt.scatter(title, opt, c='b', s=7) plt.plot(title, opt, linewidth=3, alpha=0.7) plt.show()def main(): data = https://www.it610.com/article/pd.read_csv(filepath_or_buffer='/Users/we2swing/Desktop/111.csv') sepdata(data)if __name__ == '__main__': main()

【基于邻域粗糙集算法python实现】里面很多代码延续的是之前上面传送门里文章的代码。
觉得有用的话点个赞
测试数据:
链接:https://pan.baidu.com/s/1dtle6A6od8bOCwDPW1lZ2w
提取码:o2qf

    推荐阅读