Commit 972609a

feat: add optional GPU backend and CLI/API backend selector

- Introduce an experimental GPU backend using CuPy to accelerate singleton (k=1) support counting; non-singleton candidates fall back to Rust (if available) or Python.
- Expose backend selection across layers: accelerate.support_counts now accepts backend: Optional[str] (overriding the GSPPY_BACKEND env var when provided).
- GSP._support and GSP.search accept a backend parameter and forward it to the acceleration layer.
- Add a --backend option to the CLI (choices: auto, python, rust, gpu); when non-auto, it sets GSPPY_BACKEND for the run.
- Update the README with a new GPU acceleration section, installation via an optional extra, runtime selection instructions, and revised CLI examples including --backend.
- Add a gpu extra in pyproject.toml (cupy>=11,<14) to keep GPU dependencies optional.
- Maintain the default CPU behavior (auto: try Rust, then Python) for backward compatibility.

1 parent 2d98162 · commit 972609a

File tree

5 files changed: +170 −17

README.md

Lines changed: 23 additions & 3 deletions

````diff
@@ -138,7 +138,26 @@ GSPPY_BACKEND=auto uv run --python .venv/bin/python --no-project \
   python benchmarks/bench_support.py --n_tx 1000000 --tx_len 8 --vocab 50000 --min_support 0.2 --warmup
 ```
 
-#### 4. Common development tasks
+#### 4. Optional: Enable GPU (CuPy) acceleration
+
+GPU acceleration is experimental and currently optimizes singleton (k=1) support counting using CuPy.
+Non-singleton candidates fall back to the Rust/Python backend.
+
+Install the optional extra (choose a CuPy build that matches your CUDA/ROCm setup if needed):
+
+```bash
+uv run pip install -e .[gpu]
+```
+
+Select the GPU backend at runtime:
+
+```bash
+export GSPPY_BACKEND=gpu
+```
+
+If a GPU isn't available, an error is raised when GSPPY_BACKEND=gpu is set. Otherwise, the default "auto" backend runs on the CPU.
+
+#### 5. Common development tasks
 After the environment is ready, activate it and run tasks with standard tools:
 
 ```bash
@@ -228,19 +247,20 @@ Your input file should be either:
 Use the following command to run GSPPy on your data:
 
 ```bash
-gsppy --file path/to/transactions.json --min_support 0.3
+gsppy --file path/to/transactions.json --min_support 0.3 --backend auto
 ```
 
 Or for CSV files:
 
 ```bash
-gsppy --file path/to/transactions.csv --min_support 0.3
+gsppy --file path/to/transactions.csv --min_support 0.3 --backend rust
 ```
 
 #### CLI Options
 
 - `--file`: Path to your input file (JSON or CSV). **Required**.
 - `--min_support`: Minimum support threshold as a fraction (e.g., `0.3` for 30%). Default is `0.2`.
+- `--backend`: Backend to use for support counting. One of `auto` (default), `python`, `rust`, or `gpu`.
 - `--verbose`: (Optional) Enable detailed output for debugging.
 
 #### Example
````
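Alongside the README's CLI and env-var instructions, the commit threads a `backend` parameter through the Python API (see gsppy/gsp.py below). A minimal usage sketch, assuming the `gsppy.gsp.GSP` import path used elsewhere in the package; the transactions are illustrative:

```python
# Sketch: per-call backend selection via the new search() parameter.
from gsppy.gsp import GSP

transactions = [
    ("bread", "milk"),
    ("bread", "diaper", "beer"),
    ("milk", "diaper", "beer", "cola"),
]

gsp = GSP(transactions)

# Default "auto" behavior: try Rust, then fall back to Python.
patterns = gsp.search(min_support=0.3)

# Explicitly request the experimental GPU backend for this call;
# per this commit it raises RuntimeError if no CuPy-visible GPU exists.
# patterns = gsp.search(min_support=0.3, backend="gpu")
print(patterns)
```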

gsppy/accelerate.py

Lines changed: 118 additions & 8 deletions

```diff
@@ -15,10 +15,25 @@
 from __future__ import annotations
 
 import os
-from typing import Any, Dict, List, Tuple, cast
+from typing import Any, Dict, List, Tuple, Optional, cast
 
 from .utils import split_into_batches, is_subsequence_in_list
 
+# Optional GPU (CuPy) support
+_gpu_available = False
+try:  # pragma: no cover - optional dependency path
+    import cupy as _cp_mod  # type: ignore[import-not-found]
+
+    cp = cast(Any, _cp_mod)
+
+    try:
+        _gpu_available = cp.cuda.runtime.getDeviceCount() > 0  # type: ignore[attr-defined]
+    except Exception:
+        _gpu_available = False
+except Exception:  # pragma: no cover - optional dependency path
+    cp = None  # type: ignore[assignment]
+    _gpu_available = False
+
 # Simple per-process cache for encoded transactions keyed by the list object's id
 _ENCODED_CACHE: Dict[int, Tuple[List[List[int]], Dict[int, str], Dict[str, int], int]] = {}
 
@@ -89,6 +104,40 @@ def _encode_candidates(candidates: List[Tuple[str, ...]], vocab: Dict[str, int])
     return [[vocab[s] for s in cand] for cand in candidates]
 
 
+def _support_counts_gpu_singletons(
+    enc_tx: List[List[int]],
+    cand_ids: List[int],
+    min_support_abs: int,
+    vocab_size: int,
+) -> List[Tuple[List[int], int]]:
+    """GPU-accelerated support counts for singleton candidates using CuPy.
+
+    This computes the number of transactions containing each candidate item ID.
+    It uniquifies items per transaction on CPU to preserve presence semantics,
+    then performs a single bincount on GPU.
+    """
+    # Ensure one contribution per transaction
+    unique_rows: List[List[int]] = [list(set(row)) for row in enc_tx]
+    if not unique_rows:
+        return []
+
+    # Flatten to a 1D list of item ids, then move to GPU
+    flat: List[int] = [item for row in unique_rows for item in row]
+    if not flat:
+        return []
+
+    cp_flat = cp.asarray(flat, dtype=cp.int32)  # type: ignore[name-defined]
+    counts = cp.bincount(cp_flat, minlength=vocab_size)  # type: ignore[attr-defined]
+    counts_host: Any = counts.get()  # back to host as a NumPy array
+
+    out: List[Tuple[List[int], int]] = []
+    for cid in cand_ids:
+        freq = int(counts_host[cid])
+        if freq >= min_support_abs:
+            out.append(([cid], freq))
+    return out
+
+
 def support_counts_python(
     transactions: List[Tuple[str, ...]],
     candidates: List[Tuple[str, ...]],
@@ -118,30 +167,91 @@ def support_counts(
     candidates: List[Tuple[str, ...]],
     min_support_abs: int,
     batch_size: int = 100,
+    backend: Optional[str] = None,
 ) -> Dict[Tuple[str, ...], int]:
     """Choose the best available backend for support counting.
 
-    Backend selection is controlled by the env var GSPPY_BACKEND:
+    Backend selection is controlled by the `backend` argument when provided,
+    otherwise by the env var GSPPY_BACKEND:
     - "rust": require Rust extension (raise if missing)
+    - "gpu": try GPU path when available (currently singletons optimized),
+      fall back to CPU for the rest
     - "python": force pure-Python fallback
    - otherwise: try Rust first and fall back to Python
     """
-    backend = _env_backend()
+    backend_sel = (backend or _env_backend()).lower()
 
-    if backend == "python":
+    if backend_sel == "gpu":
+        if not _gpu_available:
+            raise RuntimeError("GSPPY_BACKEND=gpu but CuPy GPU is not available")
+        # Encode once
+        enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
+        enc_cands = _encode_candidates(candidates, vocab)
+
+        # Partition candidates into singletons and non-singletons
+        singletons: List[Tuple[int, Tuple[str, ...]]] = []
+        others: List[Tuple[List[int], Tuple[str, ...]]] = []
+        # Pair original and encoded candidates; lengths should match
+        assert len(candidates) == len(enc_cands), "Encoded candidates length mismatch"
+        for orig, enc in zip(candidates, enc_cands):  # noqa: B905 - lengths checked above
+            if len(enc) == 1:
+                singletons.append((enc[0], orig))
+            else:
+                others.append((enc, orig))
+
+        out: Dict[Tuple[str, ...], int] = {}
+
+        # GPU path for singletons
+        if singletons:
+            vocab_size = max(vocab.values()) + 1 if vocab else 0
+            gpu_res = _support_counts_gpu_singletons(
+                enc_tx=enc_tx,
+                cand_ids=[cid for cid, _ in singletons],
+                min_support_abs=min_support_abs,
+                vocab_size=vocab_size,
+            )
+            # Map back to original strings
+            cand_by_id: Dict[int, Tuple[str, ...]] = {cid: orig for cid, orig in singletons}
+            for enc_cand, freq in gpu_res:
+                cid = enc_cand[0]
+                out[cand_by_id[cid]] = int(freq)
+
+        # Fallback for others (prefer Rust when available)
+        if others:
+            if _rust_available:
+                try:
+                    other_enc = [enc for enc, _ in others]
+                    res = cast(
+                        List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, other_enc, int(min_support_abs))
+                    )
+                    for enc_cand, freq in res:
+                        out[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
+                except Exception:
+                    # fall back to Python
+                    out.update(
+                        support_counts_python(transactions, [orig for _, orig in others], min_support_abs, batch_size)
+                    )
+            else:
+                out.update(
+                    support_counts_python(transactions, [orig for _, orig in others], min_support_abs, batch_size)
+                )
+
+        return out
+
+    if backend_sel == "python":
         return support_counts_python(transactions, candidates, min_support_abs, batch_size)
 
-    if backend == "rust":
+    if backend_sel == "rust":
         if not _rust_available:
             raise RuntimeError("GSPPY_BACKEND=rust but Rust extension _gsppy_rust is not available")
         # use rust
         enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
         enc_cands = _encode_candidates(candidates, vocab)
         result = cast(List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, enc_cands, int(min_support_abs)))
-        out: Dict[Tuple[str, ...], int] = {}
+        out_rust: Dict[Tuple[str, ...], int] = {}
         for enc_cand, freq in result:
-            out[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
-        return out
+            out_rust[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
+        return out_rust
 
     # auto: try rust then fallback
     if _rust_available:
```
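The heart of the GPU path is presence-semantics counting: items are uniquified per transaction before a single `bincount`, so each transaction contributes at most once per item and counts are transaction frequencies rather than raw occurrences. A CPU-only NumPy sketch of the same logic, for illustration only (not part of this commit):

```python
import numpy as np

def singleton_supports_cpu(enc_tx, vocab_size):
    """Count, per item id, how many transactions contain it at least once."""
    # Uniquify each transaction so it contributes at most 1 per item.
    flat = [item for row in enc_tx for item in set(row)]
    if not flat:
        return np.zeros(vocab_size, dtype=np.int64)
    return np.bincount(np.asarray(flat, dtype=np.int64), minlength=vocab_size)

# Item 0 appears twice in the first transaction but is counted once there.
counts = singleton_supports_cpu([[0, 0, 1], [1, 2], [0]], vocab_size=3)
print(counts)  # -> [2 2 1]
```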

gsppy/cli.py

Lines changed: 12 additions & 1 deletion

```diff
@@ -156,8 +156,15 @@ def detect_and_read_file(file_path: str) -> List[List[str]]:
     type=float,
     help="Minimum support threshold as a fraction of total transactions.",
 )
+@click.option(
+    "--backend",
+    type=click.Choice(["auto", "python", "rust", "gpu"], case_sensitive=False),
+    default="auto",
+    show_default=True,
+    help="Backend to use for support counting.",
+)
 @click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
-def main(file_path: str, min_support: float, verbose: bool) -> None:
+def main(file_path: str, min_support: float, backend: str, verbose: bool) -> None:
     """
     Run the GSP algorithm on transactional data from a file.
     """
@@ -175,6 +182,10 @@ def main(file_path: str, min_support: float, verbose: bool) -> None:
         logger.error("Error: min_support must be in the range (0.0, 1.0].")
         sys.exit(1)
 
+    # Select backend for acceleration layer
+    if backend and backend.lower() != "auto":
+        os.environ["GSPPY_BACKEND"] = backend.lower()
+
     # Initialize and run GSP algorithm
     try:
         gsp = GSP(transactions)
```
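Because a non-`auto` choice is written to GSPPY_BACKEND for the run, the flag and the env var are interchangeable from the shell. A usage sketch (the file path is illustrative):

```bash
# Equivalent ways to force the Rust backend for a single run:
gsppy --file data/transactions.json --min_support 0.3 --backend rust
GSPPY_BACKEND=rust gsppy --file data/transactions.json --min_support 0.3
```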

gsppy/gsp.py

Lines changed: 14 additions & 5 deletions

```diff
@@ -235,15 +235,19 @@ def _support_python(
         return {item: freq for batch in batch_results for item, freq in batch}
 
     def _support(
-        self, items: List[Tuple[str, ...]], min_support: int = 0, batch_size: int = 100
+        self,
+        items: List[Tuple[str, ...]],
+        min_support: int = 0,
+        batch_size: int = 100,
+        backend: Optional[str] = None,
     ) -> Dict[Tuple[str, ...], int]:
         """
         Calculate support counts for candidate sequences using the fastest available backend.
         This will try the Rust extension if available (and configured), otherwise fall back to
         the Python multiprocessing implementation.
         """
         try:
-            return support_counts_accel(self.transactions, items, min_support, batch_size)
+            return support_counts_accel(self.transactions, items, min_support, batch_size, backend=backend)
         except Exception:
             # Fallback to Python implementation on any acceleration failure
             return self._support_python(items, min_support, batch_size)
@@ -261,7 +265,12 @@ def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
         """
         logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1]))
 
-    def search(self, min_support: float = 0.2, max_k: Optional[int] = None) -> List[Dict[Tuple[str, ...], int]]:
+    def search(
+        self,
+        min_support: float = 0.2,
+        max_k: Optional[int] = None,
+        backend: Optional[str] = None,
+    ) -> List[Dict[Tuple[str, ...], int]]:
         """
         Execute the Generalized Sequential Pattern (GSP) mining algorithm.
 
@@ -302,7 +311,7 @@ def search(self, min_support: float = 0.2, max_k: Optional[int] = None) -> List[
 
         # scan transactions to collect support count for each candidate
         # sequence & filter
-        self.freq_patterns.append(self._support(candidates, abs_min_support))
+        self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
 
         # (k-itemsets/k-sequence = 1)
         k_items = 1
@@ -323,7 +332,7 @@ def search(self, min_support: float = 0.2, max_k: Optional[int] = None) -> List[
 
             # candidate pruning - eliminates candidates who are not potentially
             # frequent (using support as threshold)
-            self.freq_patterns.append(self._support(candidates, abs_min_support))
+            self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
 
             self._print_status(k_items, candidates)
         logger.info("GSP algorithm completed.")
```
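Per the acceleration layer's docstring, an explicit `backend` argument takes precedence over GSPPY_BACKEND when both are present. A small sketch of that precedence (values illustrative):

```python
import os

from gsppy.gsp import GSP

os.environ["GSPPY_BACKEND"] = "python"  # process-wide default

gsp = GSP([("a", "b"), ("a", "c"), ("b", "c")])

# No argument: the env var applies, so the pure-Python backend runs.
gsp.search(min_support=0.5)

# Explicit argument: overrides the env var for this call only.
gsp.search(min_support=0.5, backend="auto")
```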

pyproject.toml

Lines changed: 3 additions & 0 deletions

```diff
@@ -54,6 +54,9 @@ dev = [
 rust = [
     "maturin==1.6.0"
 ]
+gpu = [
+    "cupy>=11,<14"
+]
 
 [tool.hatch.build]
 include = ["gsppy/*"]
```
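With the extra in place, GPU support stays opt-in and the default install remains CPU-only. An install sketch; the published package name `gsppy` is an assumption, and the quotes guard against shells that glob the brackets:

```bash
# Editable install from a local checkout, as in the README:
pip install -e ".[gpu]"

# Or, assuming the package is published as gsppy on PyPI:
pip install "gsppy[gpu]"   # resolves cupy>=11,<14
```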
