Skip to content

Commit 04becb1

Browse files
committed
fixed issues
1 parent f0b0bde commit 04becb1

File tree

4 files changed

+31
-6
lines changed

4 files changed

+31
-6
lines changed

gsppy/accelerate.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def _env_backend() -> str:
7676

7777
def _encode_transactions(transactions: List[Tuple[str, ...]]) -> Tuple[List[List[int]], Dict[int, str], Dict[str, int]]:
7878
"""Encode transactions of strings into integer IDs.
79-
79+
8080
Parameters:
8181
transactions: List of transactions where each transaction is a tuple of strings.
8282
@@ -184,6 +184,9 @@ def support_counts(
184184
- "python": force pure-Python fallback
185185
- otherwise: try Rust first and fall back to Python
186186
"""
187+
# Intentionally fall back to Python for non-contiguous queries.
188+
# The acceleration path is currently disabled for non-contiguous cases
189+
# to facilitate testing and validation of the contiguous logic.
187190
if not contiguous:
188191
return support_counts_python(
189192
transactions, candidates, min_support_abs, batch_size, contiguous
@@ -258,7 +261,6 @@ def support_counts(
258261
for enc_cand, freq in result:
259262
out_rust[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
260263
return out_rust
261-
262264
# auto: try rust then fallback
263265
if _rust_available:
264266
try:

gsppy/gsp.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,13 +230,13 @@ def _support_python(
230230
"""
231231
# Split candidates into batches
232232
batches = list(split_into_batches(items, batch_size))
233-
subsequence_checker = is_subsequence_in_list if contiguous else is_subsequence_non_contiguous
233+
#subsequence_checker = is_subsequence_in_list if contiguous else is_subsequence_non_contiguous
234234

235235
# Use multiprocessing pool to calculate frequency in parallel, batch-wise
236236
with mp.Pool(processes=mp.cpu_count()) as pool:
237237
batch_results = pool.starmap(
238238
self._worker_batch, # Process a batch at a time
239-
[(batch, self.transactions, min_support,subsequence_checker) for batch in batches],
239+
[(batch, self.transactions, min_support,contiguous) for batch in batches],
240240
)
241241

242242
# Flatten the list of results and convert to a dictionary
@@ -279,6 +279,7 @@ def search(
279279
min_support: float = 0.2,
280280
max_k: Optional[int] = None,
281281
backend: Optional[str] = None,
282+
batch_size: int = 100,
282283
contiguous: bool = False,
283284
) -> List[Dict[Tuple[str, ...], int]]:
284285
"""

gsppy/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,13 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
6868
# Use any to check if any slice matches the sequence
6969
return any(sequence[i : i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))
7070

71-
@lru_cache(maxsize=None)
71+
@lru_cache(maxsize=32768)
7272
def is_subsequence_non_contiguous(subsequence: Tuple[str, ...], sequence: Tuple[str, ...]) -> bool:
7373
"""
7474
Check if a subsequence exists within a sequence, allowing for gaps (non-contiguous).
7575
"""
7676
if not subsequence:
77-
return False
77+
return True
7878
it = iter(sequence)
7979
return all(item in it for item in subsequence)
8080

tests/test_gsp.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,3 +268,25 @@ def test_gsp_enhancement_contiguous_vs_non_contiguous():
268268
found_ac_contiguous = any(('a', 'c') in d for d in result_contiguous)
269269
assert found_ac_contiguous is False, \
270270
"The pattern ('a', 'c') should NOT be found with a strict contiguous search."
271+
272+
def test_non_contiguous_multiprocessing():
273+
# Dataset where ('a','c') is a non‑contiguous subsequence but not a contiguous one.
274+
sequences = [
275+
['a', 'b', 'c'],
276+
['a', 'c'],
277+
['b', 'c', 'a'],
278+
['a', 'b', 'c', 'd'],
279+
]
280+
gsp = GSP(sequences)
281+
282+
# Use a tiny batch size to force multiple batches and trigger multiprocessing.
283+
result_non_contig = gsp.search(min_support=0.5, contiguous=False, backend='python', batch_size=1)
284+
# In non‑contiguous mode, ('a','c') should be considered frequent (support = 3/4).
285+
assert any(('a', 'c') in level for level in result_non_contig), \
286+
"Expected to find ('a','c') as a non‑contiguous frequent subsequence"
287+
print(result_non_contig)
288+
# Also verify that contiguous search does not report ('a','c').
289+
result_contig = gsp.search(min_support=0.5, contiguous=True, backend='python', batch_size=1)
290+
print(result_contig)
291+
assert not any(('a', 'c') in level for level in result_contig), \
292+
"('a','c') should not appear in a strict contiguous search"

0 commit comments

Comments
 (0)