
Commit f59fdae

PERF-#7657: Fork pandas eval() implementation.
Signed-off-by: sfc-gh-mvashishtha <[email protected]>
1 parent 8d468dc commit f59fdae
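
For context, a minimal sketch of the user-facing API this fork is meant to accelerate, assuming Modin's DataFrame.eval mirrors the pandas signature (the public entry point itself is not shown in this diff; the modules below are forks of pandas' internal eval machinery):

import modin.pandas as pd

# Illustrative only: pandas-style expression evaluation on a Modin DataFrame.
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df["c"] = df.eval("a + b")
print(df)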

File tree

11 files changed: +2799 additions, -8 deletions

modin/core/computation/__init__.py

Whitespace-only changes.

modin/core/computation/align.py

Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
"""
Core eval alignment algorithms. Forked from pandas.core.computation.align
"""

from __future__ import annotations

import warnings
from collections.abc import Sequence
from functools import (
    partial,
    wraps,
)
from typing import (
    Callable,
)

import numpy as np
import pandas
import pandas.core.common as com
from pandas._typing import F
from pandas.core.base import PandasObject
from pandas.errors import PerformanceWarning

from modin.core.computation.common import result_type_many
from modin.pandas import DataFrame, Series
from modin.pandas.base import BasePandasDataset


def _align_core_single_unary_op(
    term,
) -> tuple[partial | type[BasePandasDataset], dict[str, pandas.Index] | None]:
    typ: partial | type[BasePandasDataset]
    axes: dict[str, pandas.Index] | None = None

    if isinstance(term.value, np.ndarray):
        typ = partial(np.asanyarray, dtype=term.value.dtype)
    else:
        typ = type(term.value)
        if hasattr(term.value, "axes"):
            axes = _zip_axes_from_type(typ, term.value.axes)

    return typ, axes


def _zip_axes_from_type(
    typ: type[BasePandasDataset], new_axes: Sequence[pandas.Index]
) -> dict[str, pandas.Index]:
    return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)}


def _any_pandas_objects(terms) -> bool:
    """
    Check a sequence of terms for instances of PandasObject.
    """
    return any(isinstance(term.value, PandasObject) for term in terms)


def _filter_special_cases(f) -> Callable[[F], F]:
    @wraps(f)
    def wrapper(terms):
        # single unary operand
        if len(terms) == 1:
            return _align_core_single_unary_op(terms[0])

        term_values = (term.value for term in terms)

        # we don't have any pandas objects
        if not _any_pandas_objects(terms):
            return result_type_many(*term_values), None

        return f(terms)

    return wrapper


@_filter_special_cases
def _align_core(terms):
    term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")]
    term_dims = [terms[i].value.ndim for i in term_index]

    ndims = pandas.Series(dict(zip(term_index, term_dims)))

    # initial axes are the axes of the largest-axis'd term
    biggest = terms[ndims.idxmax()].value
    typ = biggest._constructor
    axes = biggest.axes
    naxes = len(axes)
    gt_than_one_axis = naxes > 1

    for value in (terms[i].value for i in term_index):
        is_series = isinstance(value, Series)
        is_series_and_gt_one_axis = is_series and gt_than_one_axis

        for axis, items in enumerate(value.axes):
            if is_series_and_gt_one_axis:
                ax, itm = naxes - 1, value.index
            else:
                ax, itm = axis, items

            if not axes[ax].is_(itm):
                axes[ax] = axes[ax].union(itm)

    for i, ndim in ndims.items():
        for axis, items in zip(range(ndim), axes):
            ti = terms[i].value

            if hasattr(ti, "reindex"):
                transpose = isinstance(ti, Series) and naxes > 1
                reindexer = axes[naxes - 1] if transpose else items

                term_axis_size = len(ti.axes[axis])
                reindexer_size = len(reindexer)

                ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
                if ordm >= 1 and reindexer_size >= 10000:
                    w = (
                        f"Alignment difference on axis {axis} is larger "
                        + f"than an order of magnitude on term {repr(terms[i].name)}, "
                        + f"by more than {ordm:.4g}; performance may suffer."
                    )
                    warnings.warn(w, category=PerformanceWarning)

                obj = ti.reindex(reindexer, axis=axis, copy=False)
                terms[i].update(obj)

        terms[i].update(terms[i].value.values)

    return typ, _zip_axes_from_type(typ, axes)


def align_terms(terms):
    """
    Align a set of terms.
    """
    try:
        # flatten the parse tree (a nested list, really)
        terms = list(com.flatten(terms))
    except TypeError:
        # can't iterate so it must just be a constant or single variable
        if isinstance(terms.value, (Series, DataFrame)):
            typ = type(terms.value)
            return typ, _zip_axes_from_type(typ, terms.value.axes)
        return np.result_type(terms.type), None

    # if all resolved variables are numeric scalars
    if all(term.is_scalar for term in terms):
        return result_type_many(*(term.value for term in terms)).type, None

    # perform the main alignment
    typ, axes = _align_core(terms)
    return typ, axes


def reconstruct_object(typ, obj, axes, dtype):
    """
    Reconstruct an object given its type, raw value, and possibly empty
    (None) axes.

    Parameters
    ----------
    typ : object
        A type
    obj : object
        The value to use in the type constructor
    axes : dict
        The axes to use to construct the resulting pandas object

    Returns
    -------
    ret : typ
        An object of type ``typ`` with the value `obj` and possible axes
        `axes`.
    """
    try:
        typ = typ.type
    except AttributeError:
        pass

    res_t = np.result_type(obj.dtype, dtype)

    if not isinstance(typ, partial) and issubclass(typ, PandasObject):
        return typ(obj, dtype=res_t, **axes)

    # special case for pathological things like ~True/~False
    if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_:
        ret_value = res_t.type(obj)
    else:
        ret_value = typ(obj).astype(res_t)
        # The condition is to distinguish 0-dim array (returned in case of
        # scalar) and 1 element array
        # e.g. np.array(0) and np.array([0])
        if (
            len(obj.shape) == 1
            and len(obj) == 1
            and not isinstance(ret_value, np.ndarray)
        ):
            ret_value = np.array([ret_value]).astype(res_t)

    return ret_value
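
Not part of the diff: a hand-rolled illustration of the alignment that _align_core performs (operands are reindexed onto the union of their axes before evaluation), using only public Modin APIs and assuming a working Modin engine:

import modin.pandas as pd

# Two Series with partially overlapping indexes.
s1 = pd.Series([1.0, 2.0], index=["a", "b"])
s2 = pd.Series([3.0, 4.0], index=["b", "c"])

# Take the union of the axes, then reindex each operand onto it; labels
# missing from an operand become NaN, mirroring what the alignment above does.
union = s1.index.union(s2.index)  # Index(['a', 'b', 'c'])
aligned = [s.reindex(union) for s in (s1, s2)]
print(aligned[0] + aligned[1])  # a: NaN, b: 5.0, c: NaN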

modin/core/computation/check.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
"""
Forked from pandas.core.computation.check
"""

from __future__ import annotations

from pandas.compat._optional import import_optional_dependency

ne = import_optional_dependency("numexpr", errors="warn")
NUMEXPR_INSTALLED = ne is not None

__all__ = ["NUMEXPR_INSTALLED"]
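
A small usage sketch (not from the commit): the flag mirrors how pandas gates its numexpr-backed engine, falling back to the pure-Python engine when numexpr is not importable:

from modin.core.computation.check import NUMEXPR_INSTALLED

# Prefer the numexpr engine when the optional dependency is present.
engine = "numexpr" if NUMEXPR_INSTALLED else "python"
print(f"eval engine: {engine}")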

modin/core/computation/common.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
"""
Forked from pandas.core.computation.common
"""

from __future__ import annotations

from functools import reduce

import numpy as np
from pandas._config import get_option


def ensure_decoded(s) -> str:
    """
    If we have bytes, decode them to unicode.
    """
    if isinstance(s, (np.bytes_, bytes)):
        s = s.decode(get_option("display.encoding"))
    return s


def result_type_many(*arrays_and_dtypes):
    """
    Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32)
    argument limit.
    """
    try:
        return np.result_type(*arrays_and_dtypes)
    except ValueError:
        # we have > NPY_MAXARGS terms in our expression
        return reduce(np.result_type, arrays_and_dtypes)
    except TypeError:
        from pandas.core.dtypes.cast import find_common_type
        from pandas.core.dtypes.common import is_extension_array_dtype

        arr_and_dtypes = list(arrays_and_dtypes)
        ea_dtypes, non_ea_dtypes = [], []
        for arr_or_dtype in arr_and_dtypes:
            if is_extension_array_dtype(arr_or_dtype):
                ea_dtypes.append(arr_or_dtype)
            else:
                non_ea_dtypes.append(arr_or_dtype)

        if non_ea_dtypes:
            try:
                np_dtype = np.result_type(*non_ea_dtypes)
            except ValueError:
                np_dtype = reduce(np.result_type, arrays_and_dtypes)
            return find_common_type(ea_dtypes + [np_dtype])

        return find_common_type(ea_dtypes)
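
For illustration (not part of the commit), result_type_many behaves like np.result_type but also copes with argument counts above NPY_MAXARGS and with extension-array dtypes:

import numpy as np

from modin.core.computation.common import result_type_many

# More operands than np.result_type may accept in one call on some NumPy
# builds; the wrapper falls back to a pairwise reduce in that case.
dtypes = [np.dtype("int8")] * 40 + [np.dtype("float64")]
print(result_type_many(*dtypes))  # float64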
