python|python apriori

小修改+注释

""" # Python 2.7 # Filename: apriori.py # Author: llhthinker # Email: hangliu56[AT]gmail[DOT]com # Blog: http://www.cnblogs.com/llhthinker/p/6719779.html # Date: 2017-04-16 """ """ data_set = list[list[]] L = list[set(frozenset())] support_data = https://www.it610.com/article/dic{frozenset()} = value (support count) C1 = set(frozenset()) Lk = set(frozenset()) item_count = dic{frozenset()} Lksub1 = set(frozenset()) Ck_item = frozenset() Ck = set(frozenset())""" #return a list(list) def load_data_set(): """ Load a sample data set (From Data Mining: Concepts and Techniques, 3th Edition) Returns: A data set: A list of transactions. Each transaction contains several items. """ data_set = [['s1', 's2', 's5'], ['s2', 's4'], ['s2', 's3'], ['s1', 's2', 's4'], ['s1', 's3'], ['s2', 's3'], ['s1', 's3'], ['s1', 's2', 's3', 's5'], ['s1', 's2', 's3']] """ the type of the data_set is list of list----------------------------------------------- """ return data_set#return a set(frozenset) def create_C1(data_set): """ Create frequent candidate 1-itemset C1 by scaning data_set. Args: data_set: A list of transactions. Each transaction contains several items. Returns: C1: A set which contains all frequent candidate 1-itemsets """ """ The explain of frozenset :http://www.cnblogs.com/panwenbin-logs/p/5519617.html """ C1 = set() for t in data_set: for item in t: item_set = frozenset([item]) #print(type(item_set),item_set) C1.add(item_set) #print(C1) return C1#return a bool -> just judge**step of pruning** def is_apriori(Ck_item, Lksub1): """ Judge whether a frequent candidate k-itemset satisfy Apriori property. Args: Ck_item: a frequent candidate k-itemset in Ck which contains all frequent candidate k-itemsets. Lksub1: Lk-1, a set which contains all frequent candidate (k-1)-itemsets. Returns: True: satisfying Apriori property. False: Not satisfying Apriori property. """ for item in Ck_item:#Ck_item is only frozenset which contains only one element(set). #print("aaa") #print(item)#str #print('bbb') #print(Ck_item)# #print(type(Ck_item)) #print("origin") #print(Ck_item) sub_Ck = Ck_item - frozenset([item])#sub_Ck is (k-1)-itemsets #print("after pruning") #print(sub_Ck) if sub_Ck not in Lksub1: #print("xxx") #print(sub_Ck) return False return True#return a set(frozenset())**step of connection** def create_Ck(Lksub1, k): """ Create Ck, a set which contains all all frequent candidate k-itemsets by Lk-1's own connection operation. Args: Lksub1: Lk-1, a set which contains all frequent candidate (k-1)-itemsets. k: the item number of a frequent itemset. Return: Ck: a set which contains all all frequent candidate k-itemsets. """ Ck = set() len_Lksub1 = len(Lksub1)#the numbers of the (k-1)-itemsets #print(len_Lksub1) list_Lksub1 = list(Lksub1)#transform (k-1)-itemsets of the set into list #print(list_Lksub1) for i in range(len_Lksub1): for j in range(i+1, len_Lksub1): l1 = list(list_Lksub1[i])#list of the list l2 = list(list_Lksub1[j]) l1.sort() l2.sort() #print(l1) #print(l2) if l1[0:k-2] == l2[0:k-2]: Ck_item = list_Lksub1[i] | list_Lksub1[j]#connecting list( two (k-1)-itemsets ) #print("xxx") #print(Ck_item) #print(list_Lksub1) -------------- #print(type(Ck_item)) #print(type(list_Lksub1))#process -> list_Lk = list_1 | list_2 -> tranform list_LK into Ck_item #elsepruning if is_apriori(Ck_item, Lksub1): Ck.add(Ck_item) #print(Ck) #print(type(Ck)) return Ck#return a set(frozenset)**scaning the data set** def generate_Lk_by_Ck(data_set, Ck, min_support, support_data): """ Generate Lk by executing a delete policy from Ck. Args: data_set: A list of transactions. Each transaction contains several items. Ck: A set which contains all all frequent candidate k-itemsets. min_support: The minimum support. support_data: A dictionary. The key is frequent itemset and the value is support. Returns: Lk: A set which contains all all frequent k-itemsets. """ Lk = set() item_count = {} for t in data_set:# t represent a transation for item in Ck:#item represent a candidate k-itemsets """ print(type(item))class->frozenset print(item)->frozenset({'l2'}),which can be the key of the dictionary print(type(t))class->list print(t)[lx,lx,...lx] """ if item.issubset(t):# the set of item is the subset of the list of t #print("Yes") if item not in item_count: item_count[item] = 1 else: item_count[item] += 1 # else: #print("No")t_num = float(len(data_set)) # total numbers of transations for item in item_count: if (item_count[item] / t_num) >= min_support: Lk.add(item) #print(Lk) support_data[item] = item_count[item] #/ t_num return Lk#return L = list(set(frozenset)) , support_data = https://www.it610.com/article/dic() def generate_L(data_set, k, min_support):""" Generate all frequent itemsets. Args: data_set: A list of transactions. Each transaction contains several items. k: Maximum number of items for all frequent itemsets. min_support: The minimum support. Returns: L: The list of Lk. support_data: A dictionary. The key is frequent itemset and the value is support. """ support_data = https://www.it610.com/article/{} C1 = create_C1(data_set) L1 = generate_Lk_by_Ck(data_set, C1, min_support, support_data) Lksub1 = L1.copy() #print(Lksub1) L = [] L.append(Lksub1) #print(L) for i in range(2, k+1): Ci = create_Ck(Lksub1, i) Li = generate_Lk_by_Ck(data_set, Ci, min_support, support_data) Lksub1 = Li.copy() L.append(Lksub1)#every time append a set(frozenset) where contain k-itemsets return L, support_datadef generate_big_rules(L, support_data, min_conf):""" Generate big rules from frequent itemsets. Args: L: The list of Lk. support_data: A dictionary. The key is frequent itemset and the value is support. min_conf: Minimal confidence. Returns: big_rule_list: A list which contains all big rules. Each big rule is represented as a 3-tuple. """ big_rule_list = [] sub_set_list = [] for i in range(0, len(L)): for freq_set in L[i]: for sub_set in sub_set_list: if sub_set.issubset(freq_set): conf = support_data[freq_set] / support_data[freq_set - sub_set] big_rule = (freq_set - sub_set, sub_set, conf) if conf >= min_conf and big_rule not in big_rule_list: # print freq_set-sub_set, " => ", sub_set, "conf: ", conf big_rule_list.append(big_rule) sub_set_list.append(freq_set) return big_rule_listif __name__ == "__main__": """ Test """ data_set = load_data_set()#load dataL, support_data = https://www.it610.com/article/generate_L(data_set, k=3, min_support=0.2)for Lk in L: print ("="*50) print ("frequent " + str(len(list(Lk)[0])) + "-itemsets\t\tsupport") print ("="*50) for freq_set in Lk: print (freq_set, support_data[freq_set]) print ()""" big_rules_list = generate_big_rules(L, support_data, min_conf=0.7) print ("Big Rules") for item in big_rules_list: print (item[0], "=>", item[1], "conf: ", item[2]) """


    推荐阅读