Skip to content

Commit 73e5b88

Browse files
author
xuming06
committed
add frequent pattern
1 parent cc8cdf3 commit 73e5b88

9 files changed

Lines changed: 102858 additions & 0 deletions

File tree

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# -*- coding: utf-8 -*-
2+
# Author: XuMing <[email protected]>
3+
# Brief:
4+
def init_c1(data_set_dict, min_support):
5+
c1 = []
6+
freq_dic = {}
7+
for trans in data_set_dict:
8+
for item in trans:
9+
freq_dic[item] = freq_dic.get(item, 0) + data_set_dict[trans]
10+
# 优化初始的集合,使不满足最小支持度的直接排除
11+
c1 = [[k] for (k, v) in freq_dic.items() if v >= min_support]
12+
c1.sort()
13+
return map(frozenset, c1)
14+
15+
16+
def scan_data(data_set, ck, min_support, freq_items):
17+
"""
18+
计算Ck中的项在数据集合中的支持度,剪枝过程
19+
:param data_set:
20+
:param ck:
21+
:param min_support: 最小支持度
22+
:param freq_items: 存储满足支持度的频繁项集
23+
:return:
24+
"""
25+
ss_cnt = {}
26+
# 每次遍历全体数据集
27+
for trans in data_set:
28+
for item in ck:
29+
# 对每一个候选项集, 检查是否是 term中的一部分(子集),即候选项能否得到支持
30+
if item.issubset(trans):
31+
ss_cnt[item] = ss_cnt.get(item, 0) + 1
32+
ret_list = []
33+
for key in ss_cnt:
34+
support = ss_cnt[key] # 每个项的支持度
35+
if support >= min_support:
36+
ret_list.insert(0, key) # 将满足最小支持度的项存入集合
37+
freq_items[key] = support #
38+
return ret_list
39+
40+
41+
def apriori_gen(lk, k):
42+
"""
43+
由Lk的频繁项集生成新的候选项集 连接过程
44+
:param lk: 频繁项集集合
45+
:param k: k 表示集合中所含的元素个数
46+
:return: 候选项集集合
47+
"""
48+
ret_list = []
49+
for i in range(len(lk)):
50+
for j in range(i + 1, len(lk)):
51+
l1 = list(lk[i])[:k - 2]
52+
l2 = list(lk[j])[:k - 2]
53+
l1.sort()
54+
l2.sort()
55+
if l1 == l2:
56+
ret_list.append(lk[i] | lk[j]) # 求并集
57+
# retList.sort()
58+
return ret_list
59+
60+
61+
def apriori_zc(data_set, data_set_dict, min_support=5):
62+
"""
63+
Apriori算法过程
64+
:param data_set: 数据集
65+
:param min_support: 最小支持度,默认值 0.5
66+
:return:
67+
"""
68+
c1 = init_c1(data_set_dict, min_support)
69+
data = map(set, data_set) # 将dataSet集合化,以满足scanD的格式要求
70+
freq_items = {}
71+
l1 = scan_data(data, c1, min_support, freq_items) # 构建初始的频繁项集
72+
l = [l1]
73+
# 最初的L1中的每个项集含有一个元素,新生成的项集应该含有2个元素,所以 k=2
74+
k = 2
75+
while len(l[k - 2]) > 0:
76+
ck = apriori_gen(l[k - 2], k)
77+
lk = scan_data(data, ck, min_support, freq_items)
78+
l.append(lk)
79+
k += 1 # 新生成的项集中的元素个数应不断增加
80+
return freq_items

22data-mining/frequent_patterns/dataSet/connectPro.txt

Lines changed: 10001 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)