1+ # -*- coding: utf-8 -*-
2+ # Author: XuMing <[email protected] > 3+ # Brief:
4+ def init_c1 (data_set_dict , min_support ):
5+ c1 = []
6+ freq_dic = {}
7+ for trans in data_set_dict :
8+ for item in trans :
9+ freq_dic [item ] = freq_dic .get (item , 0 ) + data_set_dict [trans ]
10+ # 优化初始的集合,使不满足最小支持度的直接排除
11+ c1 = [[k ] for (k , v ) in freq_dic .items () if v >= min_support ]
12+ c1 .sort ()
13+ return map (frozenset , c1 )
14+
15+
16+ def scan_data (data_set , ck , min_support , freq_items ):
17+ """
18+ 计算Ck中的项在数据集合中的支持度,剪枝过程
19+ :param data_set:
20+ :param ck:
21+ :param min_support: 最小支持度
22+ :param freq_items: 存储满足支持度的频繁项集
23+ :return:
24+ """
25+ ss_cnt = {}
26+ # 每次遍历全体数据集
27+ for trans in data_set :
28+ for item in ck :
29+ # 对每一个候选项集, 检查是否是 term中的一部分(子集),即候选项能否得到支持
30+ if item .issubset (trans ):
31+ ss_cnt [item ] = ss_cnt .get (item , 0 ) + 1
32+ ret_list = []
33+ for key in ss_cnt :
34+ support = ss_cnt [key ] # 每个项的支持度
35+ if support >= min_support :
36+ ret_list .insert (0 , key ) # 将满足最小支持度的项存入集合
37+ freq_items [key ] = support #
38+ return ret_list
39+
40+
41+ def apriori_gen (lk , k ):
42+ """
43+ 由Lk的频繁项集生成新的候选项集 连接过程
44+ :param lk: 频繁项集集合
45+ :param k: k 表示集合中所含的元素个数
46+ :return: 候选项集集合
47+ """
48+ ret_list = []
49+ for i in range (len (lk )):
50+ for j in range (i + 1 , len (lk )):
51+ l1 = list (lk [i ])[:k - 2 ]
52+ l2 = list (lk [j ])[:k - 2 ]
53+ l1 .sort ()
54+ l2 .sort ()
55+ if l1 == l2 :
56+ ret_list .append (lk [i ] | lk [j ]) # 求并集
57+ # retList.sort()
58+ return ret_list
59+
60+
61+ def apriori_zc (data_set , data_set_dict , min_support = 5 ):
62+ """
63+ Apriori算法过程
64+ :param data_set: 数据集
65+ :param min_support: 最小支持度,默认值 0.5
66+ :return:
67+ """
68+ c1 = init_c1 (data_set_dict , min_support )
69+ data = map (set , data_set ) # 将dataSet集合化,以满足scanD的格式要求
70+ freq_items = {}
71+ l1 = scan_data (data , c1 , min_support , freq_items ) # 构建初始的频繁项集
72+ l = [l1 ]
73+ # 最初的L1中的每个项集含有一个元素,新生成的项集应该含有2个元素,所以 k=2
74+ k = 2
75+ while len (l [k - 2 ]) > 0 :
76+ ck = apriori_gen (l [k - 2 ], k )
77+ lk = scan_data (data , ck , min_support , freq_items )
78+ l .append (lk )
79+ k += 1 # 新生成的项集中的元素个数应不断增加
80+ return freq_items
0 commit comments