|
1 | 1 | import base64 |
2 | | -import pickle |
| 2 | +import json |
3 | 3 | from abc import abstractmethod |
4 | 4 | from collections.abc import Callable |
5 | | -from typing import Any |
| 5 | +from typing import Any, ClassVar |
| 6 | + |
| 7 | +from datachain.plugins import ensure_plugins_loaded |
| 8 | + |
| 9 | + |
class CallableRegistry:
    """Registry mapping stable string names to factory callables.

    Serialized payloads refer to a callable by its registered name, and
    lookups go through :meth:`get`, so only callables that were explicitly
    registered can be resolved from a name.
    """

    # Shared across the whole process; keyed by the registered name.
    _registry: ClassVar[dict[str, Callable]] = {}

    @classmethod
    def register(cls, callable_obj: Callable, name: str) -> str:
        """Register ``callable_obj`` under ``name`` and return ``name``.

        Registering a name that already exists overwrites the previous entry.
        """
        cls._registry[name] = callable_obj
        return name

    @classmethod
    def get(cls, name: str) -> Callable:
        """Return the callable registered under ``name``.

        Raises:
            KeyError: If nothing has been registered under ``name``.
        """
        try:
            return cls._registry[name]
        except KeyError:
            # Same exception type as a plain dict lookup, but with a message
            # that says what was being looked up and where.
            raise KeyError(
                f"No callable registered under name {name!r}"
            ) from None
6 | 21 |
|
7 | 22 |
|
class Serializable:
    """Base for objects that can be described as (callable, args, kwargs).

    Subclasses implement :meth:`clone_params` and
    :meth:`serialize_callable_name`; :meth:`serialize` turns that description
    into a base64-encoded JSON string that refers to the factory callable by
    name only.
    """

    @classmethod
    @abstractmethod
    def serialize_callable_name(cls) -> str:
        """Return the registered name used for this class' factory callable."""

    @abstractmethod
    def clone_params(self) -> tuple[Callable[..., Any], list[Any], dict[str, Any]]:
        """Return (callable, args, kwargs) necessary to recreate this object."""

    @staticmethod
    def _is_clone_params(value: Any) -> bool:
        """Return True if ``value`` looks like a (callable, args, kwargs) tuple."""
        return isinstance(value, tuple) and len(value) == 3 and callable(value[0])

    def _prepare(self, params: tuple) -> dict:
        """Convert clone params into a dict that names the callable.

        Args:
            params: A ``(callable, args, kwargs)`` tuple. The callable must be
                a bound method (for example a classmethod) whose ``__self__``
                provides ``serialize_callable_name()``.

        Returns:
            A dict with keys ``"callable"`` (the registered name), ``"args"``
            and ``"kwargs"``. Clone-parameter tuples found among the kwargs
            values are converted recursively; all other values, including
            ordinary tuples, are passed through unchanged.
        """
        factory, args, kwargs = params
        callable_name = factory.__self__.serialize_callable_name()
        return {
            "callable": callable_name,
            "args": args,
            "kwargs": {
                # Only genuine clone-parameter tuples are nested objects; a
                # plain tuple value must not be unpacked as one.
                key: self._prepare(value) if self._is_clone_params(value) else value
                for key, value in kwargs.items()
            },
        }

    def serialize(self) -> str:
        """Return a base64-encoded JSON string with registered callable + params."""
        _ensure_default_callables_registered()
        payload = self._prepare(self.clone_params())
        return base64.b64encode(json.dumps(payload).encode()).decode()
22 | 50 |
|
23 | 51 |
|
def deserialize(s: str) -> Serializable:
    """Deserialize from base64-encoded JSON using only registered callables.

    The top-level object is created by calling its registered callable with
    the decoded args and kwargs. Serialized objects nested inside kwargs are
    NOT instantiated: each one is resolved to a ``(callable, args, kwargs)``
    tuple and handed to the outer callable for later factory usage.

    Args:
        s: String produced by ``Serializable.serialize()``.

    Returns:
        The result of calling the top-level registered callable.

    Raises:
        ValueError: If the decoded payload (or a nested one) does not have
            the ``callable`` / ``args`` / ``kwargs`` shape with list args and
            dict kwargs.
        KeyError: If a callable name is not present in ``CallableRegistry``.

    Errors raised while base64- or JSON-decoding ``s`` propagate unchanged.
    """
    ensure_plugins_loaded()
    _ensure_default_callables_registered()
    data = json.loads(base64.b64decode(s.encode()).decode())

    def _is_serialized(obj: Any) -> bool:
        return isinstance(obj, dict) and {"callable", "args", "kwargs"}.issubset(
            obj.keys()
        )

    def _resolve(
        obj: dict[str, Any],
    ) -> tuple[Callable[..., Any], list[Any], dict[str, Any]]:
        """Map a serialized dict to (callable, args, kwargs), recursing in kwargs."""
        args = obj["args"]
        kwargs = obj["kwargs"]
        if not isinstance(args, list) or not isinstance(kwargs, dict):
            raise ValueError("Invalid serialized data format")
        # Recurse only inside kwargs because serialize() only nests through
        # kwargs. A new dict is built so the parsed payload is left untouched.
        resolved_kwargs = {
            key: _resolve(value) if _is_serialized(value) else value
            for key, value in kwargs.items()
        }
        return CallableRegistry.get(obj["callable"]), args, resolved_kwargs

    if not _is_serialized(data):
        raise ValueError("Invalid serialized data format")
    factory, args, kwargs = _resolve(data)
    return factory(*args, **kwargs)
| 88 | + |
| 89 | + |
| 90 | +class _DefaultsState: |
| 91 | + registered = False |
| 92 | + |
| 93 | + |
def _ensure_default_callables_registered() -> None:
    """Register the built-in SQLite factory callables the first time it runs.

    Later calls do nothing once ``_DefaultsState.registered`` has been set.
    """
    if not _DefaultsState.registered:
        # Imported here, at call time, rather than at module import time.
        from datachain.data_storage.sqlite import (
            SQLiteDatabaseEngine,
            SQLiteMetastore,
            SQLiteWarehouse,
        )

        add = CallableRegistry.register

        # Each name comes from the class itself, so no string literals are
        # hard-coded here; registering an existing name simply overwrites it.
        add(
            SQLiteDatabaseEngine.from_db_file,
            SQLiteDatabaseEngine.serialize_callable_name(),
        )
        add(
            SQLiteMetastore.init_after_clone,
            SQLiteMetastore.serialize_callable_name(),
        )
        add(
            SQLiteWarehouse.init_after_clone,
            SQLiteWarehouse.serialize_callable_name(),
        )

        _DefaultsState.registered = True
0 commit comments