Skip to content

Commit 4516e0a

Browse files
improvements
1 parent 2f4add7 commit 4516e0a

File tree

2 files changed

+129
-91
lines changed

2 files changed

+129
-91
lines changed

gsp.py

Lines changed: 101 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -10,97 +10,117 @@
1010
Example:
1111
1212
transactions = [
13-
['Bread', 'Milk'],
14-
['Bread', 'Diaper', 'Beer', 'Eggs'],
15-
['Milk', 'Diaper', 'Beer', 'Coke'],
16-
['Bread', 'Milk', 'Diaper', 'Beer'],
17-
['Bread', 'Milk', 'Diaper', 'Coke']
18-
]
13+
['Bread', 'Milk'],
14+
['Bread', 'Diaper', 'Beer', 'Eggs'],
15+
['Milk', 'Diaper', 'Beer', 'Coke'],
16+
['Bread', 'Milk', 'Diaper', 'Beer'],
17+
['Bread', 'Milk', 'Diaper', 'Coke']
18+
]
1919
"""
2020

2121
import logging
2222
import numpy as np
2323

24-
from itertools import combinations
24+
from collections import Counter
25+
from itertools import chain
26+
from itertools import product
2527

2628
__author__ = "Jackson Antonio do Prado Lima"
2729
__email__ = "[email protected]"
2830
__license__ = "GPL"
2931
__version__ = "1.0"
3032

31-
class GSP:
32-
def __init__(self, raw_transactions):
33-
self.freq_patterns = []
34-
self._pre_processing(raw_transactions)
35-
36-
def _pre_processing(self, raw_transactions):
37-
'''
38-
Prepare the data
39-
40-
Parameters:
41-
raw_transactions: the data that it will be analysed
42-
'''
43-
# each item is parsed to a set type
44-
self.transactions = [set(np.array(i)) for i in raw_transactions]
45-
46-
# different items in the base
47-
self.unique_candidates = [set([item]) for item in set(x for l in self.transactions for x in l)]
48-
49-
# the total of different items in the base
50-
#self.t_size = len(self.unique_candidates)
51-
52-
def _support(self, items, minsup = 0):
53-
'''
54-
The support count (or simply support) for a sequence is defined as the fraction of total data-sequences that "contain" this sequence.
55-
(Although the word "contains" is not strictly accurate once we incorporate taxonomies,
56-
it captures the spirt of when a data-sequence contributes to the support of a sequential pattern.)
57-
58-
Parameters
59-
items: set of items that will be evaluated
60-
minsup: minimum support
61-
'''
62-
results = {}
63-
for item in items:
64-
# The number of times the item appears in the transactions
65-
frequency = len([t for t in self.transactions if item.issubset(t)])
66-
67-
if frequency >= minsup:
68-
results[tuple(item)] = [frequency]
69-
return results
70-
71-
def _print_status(self, run, candidates):
72-
logging.debug("Run {}\nThere are {} candidates.\nThe candidates have been filtered down to {}.\n".format(run, len(candidates), len(self.freq_patterns[run-1])))
7333

74-
def search(self, minsup = 0.2):
75-
'''
76-
Run GSP mining algorithm
77-
78-
Parameters
79-
minsup: minimum support
80-
'''
81-
assert (0.0 < minsup) and (minsup <= 1.0)
82-
minsup = len(self.transactions) * minsup
83-
84-
# the set of frequent 1-sequence: all singleton sequences (k-itemsets/k-sequence = 1) - Initially, every item in DB is a candidate
85-
candidates = self.unique_candidates
86-
87-
# scan transactions to collect support count for each candidate sequence & filter
88-
self.freq_patterns.append(self._support(candidates, minsup))
89-
90-
# (k-itemsets/k-sequence = 1)
91-
k_items = 1
92-
93-
self._print_status(k_items, candidates)
94-
95-
# repeat until no frequent sequence or no candidate can be found
96-
while len(self.freq_patterns[k_items - 1]):
97-
k_items += 1
98-
99-
# Generate candidate sets Ck (set of candidate k-sequences) - generate new candidates from the last "best" candidates filtered by minimum support
100-
candidates = [set(c) for c in combinations(set(x for l in self.freq_patterns[k_items - 2].keys() for x in l), k_items)]
101-
102-
# candidate pruning - eliminates candidates who are not potentially frequent (using support as threshold)
103-
self.freq_patterns.append(self._support(candidates, minsup))
34+
class GSP:
    '''
    Frequent-sequence miner over a list of transactions.

    A pattern is counted as present in a transaction only when its items
    occur there consecutively and in the same order (see
    ``is_slice_in_list``).

    Attributes:
        transactions: the input transactions, each stored as a tuple
        max_size: length of the longest transaction (0 for an empty base)
        unique_candidates: every distinct item as a 1-tuple, in first-seen
            order
        freq_patterns: per-level results of the most recent ``search`` call
    '''

    def __init__(self, raw_transactions):
        '''
        Parameters:
            raw_transactions: iterable of item sequences to be analysed
        '''
        self.freq_patterns = []
        self._pre_processing(raw_transactions)

    def _pre_processing(self, raw_transactions):
        '''
        Prepare the data

        Parameters:
            raw_transactions: the data that it will be analysed
        '''
        # Materialise once so that one-shot iterables (generators) are only
        # consumed a single time; everything else derives from this list.
        self.transactions = [tuple(t) for t in raw_transactions]

        # No pattern can be longer than the longest transaction.
        # default=0 keeps an empty database from raising ValueError.
        self.max_size = max((len(t) for t in self.transactions), default=0)

        # dict.fromkeys de-duplicates while keeping first-seen order.
        distinct = dict.fromkeys(chain.from_iterable(self.transactions))
        self.unique_candidates = [(item,) for item in distinct]

    def is_slice_in_list(self, s, l):
        '''
        Tell whether ``s`` occurs as a contiguous run inside ``l``.

        Parameters
            s: the pattern (sequence of items) to look for
            l: the sequence that is searched

        Returns True when some slice of ``l`` equals ``s``. An empty ``s``
        is contained in every ``l``.
        '''
        # Normalise both sides so a list pattern can match a tuple
        # transaction (list == tuple is always False in Python).
        needle = tuple(s)
        haystack = tuple(l)
        width = len(needle)
        return any(needle == haystack[start:start + width]
                   for start in range(len(haystack) - width + 1))

    def _support(self, items, minsup=0):
        '''
        The support count (or simply support) for a sequence is defined as
        the fraction of total data-sequences that "contain" this sequence.
        (Although the word "contains" is not strictly accurate once we
        incorporate taxonomies, it captures the spirit of when a data-sequence
        contributes to the support of a sequential pattern.)

        Parameters
            items: candidate sequences that will be evaluated
            minsup: minimum number of supporting transactions

        Returns a dict mapping each candidate whose count reaches ``minsup``
        to the number of transactions that contain it.
        '''
        results = {}
        for item in items:
            # The number of transactions in which the candidate appears
            frequency = sum(1 for t in self.transactions
                            if self.is_slice_in_list(item, t))
            if frequency >= minsup:
                results[item] = frequency
        return results

    def _print_status(self, run, candidates):
        '''
        Log (DEBUG level) how many candidates level ``run`` started with and
        how many of them survived the support filter.

        Parameters
            run: 1-based level number; reads ``self.freq_patterns[run - 1]``
            candidates: the candidates that were evaluated at this level
        '''
        template = ("\nRun {}\nThere are {} candidates.\n"
                    "The candidates have been filtered down to {}.\n")
        logging.debug(template.format(run,
                                      len(candidates),
                                      len(self.freq_patterns[run - 1])))

    @staticmethod
    def _frequent_items(patterns):
        '''
        Return the distinct items used by the given patterns as plain Python
        objects, sorted when the items are mutually orderable and in
        first-seen order otherwise.
        '''
        items = list(dict.fromkeys(chain.from_iterable(patterns)))
        try:
            return sorted(items)
        except TypeError:
            # Mixed, non-comparable item types: keep first-seen order.
            return items

    def search(self, minsup=0.2):
        '''
        Run GSP mining algorithm

        Parameters
            minsup: minimum support, as a fraction of the number of
                transactions (0 < minsup <= 1)

        Returns a list of dicts; the dict at index k-1 maps every frequent
        sequence of length k (a tuple of items) to its support count.
        Levels without any frequent sequence are not included.
        '''
        assert (0.0 < minsup) and (minsup <= 1.0)
        minsup = len(self.transactions) * minsup

        # Start from a clean slate: the loop below indexes freq_patterns by
        # level, so leftovers from an earlier call would corrupt the result.
        self.freq_patterns = []

        # the set of frequent 1-sequence: all singleton sequences
        # (k-itemsets/k-sequence = 1) - Initially, every item in DB is a
        # candidate
        candidates = self.unique_candidates

        # scan transactions to collect support count for each candidate
        # sequence & filter
        self.freq_patterns.append(self._support(candidates, minsup))

        # (k-itemsets/k-sequence = 1)
        k_items = 1

        self._print_status(k_items, candidates)

        # repeat until no frequent sequence is found or the candidates would
        # become longer than the longest transaction
        while self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size:
            k_items += 1

            # Generate candidate sets Ck (set of candidate k-sequences) -
            # every ordering of length k over the items that still occur in
            # the previous level's frequent sequences
            items = self._frequent_items(self.freq_patterns[k_items - 2])
            candidates = list(product(items, repeat=k_items))

            # candidate pruning - eliminates candidates who are not
            # potentially frequent (using support as threshold)
            self.freq_patterns.append(self._support(candidates, minsup))

            self._print_status(k_items, candidates)

        # Only strip the last level when it is the empty one that ended the
        # loop; if the loop stopped on the max_size bound the last level
        # still holds valid frequent sequences.
        if not self.freq_patterns[-1]:
            return self.freq_patterns[:-1]
        return list(self.freq_patterns)

main.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,35 @@
11
import argparse
2-
import logging
2+
import logging
3+
import random
34

45
from gsp import GSP
56

67
# Configure the root logger at DEBUG level so that logging.debug messages
# (such as the per-run status emitted by GSP) are shown when this script runs.
logging.basicConfig(level=logging.DEBUG)
78

8-
# Considers these transactions
9-
items = [
10-
[1, 2],
11-
[1, 3, 4, 6],
12-
[2, 3, 4, 5],
13-
[1, 2, 3, 4],
14-
[1, 2, 3, 5]
15-
]
169

17-
print(GSP(items).search(0.3))
10+
def create_transactions(minsize, maxsize, minvalue, maxvalue):
    """Build one random transaction.

    The length is drawn first, uniformly from [minsize, maxsize]; each item
    is then drawn uniformly from [minvalue, maxvalue] (both bounds inclusive).
    Returns the items as a list of ints.
    """
    length = random.randint(minsize, maxsize)
    sequence = []
    for _ in range(length):
        sequence.append(random.randint(minvalue, maxvalue))
    return sequence
13+
14+
# Bounds for the random demo data: each transaction holds between
# `minsize` and `maxsize` items, each item lying in [minvalue, maxvalue].
minsize = 2
maxsize = 256
minvalue = 0
maxvalue = 5

# 100 randomly generated transactions.
transactions = []
for _ in range(100):
    transactions.append(
        create_transactions(minsize, maxsize, minvalue, maxvalue))

# Alternative fixed data sets, handy for reproducible runs:
#
# transactions = [
#     ['Bread', 'Milk'],
#     ['Bread', 'Diaper', 'Beer', 'Eggs'],
#     ['Milk', 'Diaper', 'Beer', 'Coke'],
#     ['Bread', 'Milk', 'Diaper', 'Beer'],
#     ['Bread', 'Milk', 'Diaper', 'Coke']
# ]
#
# transactions = [[3, 5, 2, 0, 4, 4, 1, 1], [2, 5, 5], [5, 3, 2, 4, 4, 0, 4],
#                 [4, 3, 0, 0], [1, 0, 4, 0, 0, 4], [2, 5, 1, 3, 5, 2, 5, 3],
#                 [0, 4, 0, 4, 5], [4, 2], [5], [2, 3, 0, 0, 0, 3, 0, 2, 3]]

# Mine the sequences supported by at least 30% of the transactions.
miner = GSP(transactions)
result = miner.search(0.3)

print("========= Status =========")
print("Transactions: {}".format(transactions))
print("GSP: {}".format(result))

0 commit comments

Comments
 (0)