Example:

transactions = [
    ['Bread', 'Milk'],
    ['Bread', 'Diaper', 'Beer', 'Eggs'],
    ['Milk', 'Diaper', 'Beer', 'Coke'],
    ['Bread', 'Milk', 'Diaper', 'Beer'],
    ['Bread', 'Milk', 'Diaper', 'Coke']
]
"""

import logging
import numpy as np

from collections import Counter
from itertools import chain
from itertools import product

__author__ = "Jackson Antonio do Prado Lima"

__license__ = "GPL"
__version__ = "1.0"

class GSP:

    def __init__(self, raw_transactions):
        self.freq_patterns = []
        self._pre_processing(raw_transactions)

    def _pre_processing(self, raw_transactions):
        '''
        Prepare the data

        Parameters:
            raw_transactions: the data that will be analysed
        '''
        # the longest transaction bounds the length of candidate sequences
        self.max_size = max([len(item) for item in raw_transactions])
        # each transaction is kept as a tuple so that item order is preserved
        self.transactions = [tuple(i) for i in raw_transactions]
        # occurrence count of every distinct item in the base
        counts = Counter(chain.from_iterable(raw_transactions))
        # the initial candidates are all 1-sequences (singleton tuples)
        self.unique_candidates = [tuple([k]) for k in counts]

    def is_slice_in_list(self, s, l):
        # True if the sequence s occurs as a contiguous slice of l
        len_s = len(s)  # so we don't recompute length of s on every iteration
        return any(s == l[i:len_s + i] for i in range(len(l) - len_s + 1))

    def _support(self, items, minsup=0):
        '''
        The support count (or simply support) for a sequence is defined as
        the fraction of total data-sequences that "contain" this sequence.
        (Although the word "contains" is not strictly accurate once we
        incorporate taxonomies, it captures the spirit of when a data-sequence
        contributes to the support of a sequential pattern.)

        Parameters
            items: set of items that will be evaluated
            minsup: minimum support
        '''
        results = {}
        for item in items:
            # the number of transactions that contain the candidate sequence
            frequency = len(
                [t for t in self.transactions if self.is_slice_in_list(item, t)])
            if frequency >= minsup:
                results[item] = frequency
        return results

    def _print_status(self, run, candidates):
        logging.debug("""
        Run {}
        There are {} candidates.
        The candidates have been filtered down to {}.\n"""
                      .format(run,
                              len(candidates),
                              len(self.freq_patterns[run - 1])))

    def search(self, minsup=0.2):
        '''
        Run the GSP mining algorithm

        Parameters
            minsup: minimum support
        '''
        assert (0.0 < minsup) and (minsup <= 1.0)
        minsup = len(self.transactions) * minsup

        # the set of frequent 1-sequences: all singleton sequences
        # (k-itemsets/k-sequence = 1) - initially, every item in the DB is a
        # candidate
        candidates = self.unique_candidates

        # scan transactions to collect the support count for each candidate
        # sequence & filter
        self.freq_patterns.append(self._support(candidates, minsup))

        # (k-itemsets/k-sequence = 1)
        k_items = 1

        self._print_status(k_items, candidates)

        # repeat until no frequent sequence or no candidate can be found
        while len(self.freq_patterns[k_items - 1]) and (k_items + 1 <= self.max_size):
            k_items += 1

            # Generate candidate sets Ck (set of candidate k-sequences) -
            # generate new candidates from the last "best" candidates filtered
            # by minimum support
            items = np.unique(
                list(set(self.freq_patterns[k_items - 2].keys())))
            candidates = list(product(items, repeat=k_items))

            # candidate pruning - eliminates candidates that are not
            # potentially frequent (using support as threshold)
            self.freq_patterns.append(self._support(candidates, minsup))

            self._print_status(k_items, candidates)
        return self.freq_patterns[:-1]
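

# A minimal usage sketch, assuming the module is executed directly; it reuses
# the transactions from the module docstring. The minsup value of 0.3, the
# DEBUG logging level, and the result-printing loop are illustrative choices,
# not part of the class above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    transactions = [
        ['Bread', 'Milk'],
        ['Bread', 'Diaper', 'Beer', 'Eggs'],
        ['Milk', 'Diaper', 'Beer', 'Coke'],
        ['Bread', 'Milk', 'Diaper', 'Beer'],
        ['Bread', 'Milk', 'Diaper', 'Coke']
    ]

    # search() returns one dict per sequence length; each dict maps a frequent
    # sequence (a tuple of items) to the number of transactions that contain
    # it as a contiguous slice.
    result = GSP(transactions).search(minsup=0.3)
    for k, patterns in enumerate(result, start=1):
        print("{}-sequences: {}".format(k, patterns))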