Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@
- Added support for `DataFrame.to_excel` and `Series.to_excel`.
- Added support for `pd.read_feather`.

#### Bug Fixes
- Fixed a bug in hybrid execution mode (PrPr) where certain Series operations would raise `TypeError: numpy.ndarray object is not callable`.
- Fixed a bug in hybrid execution mode (PrPr) where calling numpy operations like `np.where` on modin objects with the Pandas backend would raise an `AttributeError`. This fix requires `modin` version 0.34.0 or newer.

## 1.33.0 (2025-06-19)

### Snowpark Python API Updates
Expand Down
1 change: 1 addition & 0 deletions src/snowflake/snowpark/modin/plugin/_internal/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@
# Snowpark pandas supports the newest two released versions of modin; update this flag and remove legacy
# code as needed when we bump dependency versions.
MODIN_IS_AT_LEAST_0_33_0 = version.parse(pd.__version__) >= version.parse("0.33.0")
MODIN_IS_AT_LEAST_0_34_0 = version.parse(pd.__version__) >= version.parse("0.34.0")


# This is the default statement parameters for queries from Snowpark pandas API. It provides the fine grain metric for
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from typing import Any, Callable, List, Literal, Optional, TypeVar, Union, get_args

import modin.pandas as pd
from modin.pandas import Series, DataFrame
from modin.pandas.base import BasePandasDataset
import numpy as np
import numpy.typing as npt
import pandas as native_pd
Expand Down Expand Up @@ -813,6 +815,7 @@ def move_to_cost(
or (
is_dict_like(data)
and not isinstance(data, native_pd.DataFrame)
and not isinstance(data, native_pd.Series)
and all(
isinstance(v, pd.Series)
and isinstance(v._query_compiler, type(self))
Expand Down Expand Up @@ -1191,6 +1194,119 @@ def to_numpy(
)
return self.to_pandas().to_numpy(dtype=dtype, na_value=na_value, **kwargs)

# modin 0.34 and above
def do_array_ufunc_implementation(
self,
frame: BasePandasDataset,
ufunc: np.ufunc,
method: str,
*inputs: Any,
**kwargs: Any,
) -> Union[DataFrame, Series, Any]:
"""
Apply the provided NumPy ufunc to the underlying data.

This method is called by the ``__array_ufunc__`` dispatcher on BasePandasDataset.

Unlike other query compiler methods, this function directly operates on the input DataFrame/Series
to allow for easier argument processing. The default implementation defaults to pandas, but
a query compiler sub-class may override this method to provide a distributed implementation.

See NumPy docs: https://numpy.org/doc/stable/user/basics.subclassing.html#array-ufunc-for-ufuncs

Parameters
----------
frame : BasePandasDataset
The DataFrame or Series on which the ufunc was called. Its query compiler must match ``self``.

ufunc : np.ufunc
The function to apply.

method : str
The name of the function to apply.

*inputs : Any
Positional arguments to pass to ``ufunc``.

**kwargs : Any
Keyword arguments to pass to ``ufunc``.

Returns
-------
DataFrame, Series, or Any
The result of applying the ufunc to ``frame``.
"""
assert (
self is frame._query_compiler
), "array ufunc called with mismatched query compiler and input frame"
# Use pandas version of ufunc if it exists
if method != "__call__":
# Return sentinel value NotImplemented
return NotImplemented # pragma: no cover
from snowflake.snowpark.modin.plugin.utils.numpy_to_pandas import (
numpy_to_pandas_universal_func_map,
)

if ufunc.__name__ in numpy_to_pandas_universal_func_map:
ufunc = numpy_to_pandas_universal_func_map[ufunc.__name__]
if ufunc == NotImplemented:
return NotImplemented
# We cannot support the out argument
if kwargs.get("out") is not None:
return NotImplemented
return ufunc(frame, inputs[1:])
# return the sentinel NotImplemented if we do not support this function
return NotImplemented # pragma: no cover

# modin 0.34 and above
def do_array_function_implementation(
self,
frame: BasePandasDataset,
func: Callable,
types: tuple,
args: tuple,
kwargs: dict,
) -> Union[DataFrame, Series, Any]:
"""
Apply the provided NumPy array function to the underlying data.

This method is called by the ``__array_function__`` dispatcher on BasePandasDataset.

Unlike other query compiler methods, this function directly operates on the input DataFrame/Series
to allow for easier argument processing. The default implementation defaults to pandas, but
a query compiler sub-class may override this method to provide a distributed implementation.

See NumPy docs: https://numpy.org/neps/nep-0018-array-function-protocol.html#nep18

Parameters
----------
frame : BasePandasDataset
The DataFrame or Series on which the ufunc was called. Its query compiler must match ``self``.
func : np.func
The NumPy func to apply.
types : tuple
The types of the args.
args : tuple
The args to the func.
kwargs : dict
Additional keyword arguments.

Returns
-------
DataFrame | Series | Any
The result of applying the function to this dataset. Unlike modin, which returns a
numpy array by default, this will return a DataFrame/Series object or NotImplemented.
"""
from snowflake.snowpark.modin.plugin.utils.numpy_to_pandas import (
numpy_to_pandas_func_map,
)

if func.__name__ in numpy_to_pandas_func_map:
return numpy_to_pandas_func_map[func.__name__](*args, **kwargs)
else:
# per NEP18 we raise NotImplementedError so that numpy can intercept
return NotImplemented # pragma: no cover

def repartition(self, axis: Any = None) -> "SnowflakeQueryCompiler":
# let Snowflake handle partitioning, it makes no sense to repartition the dataframe.
return self
Expand Down Expand Up @@ -2331,7 +2447,6 @@ def _binary_op_list_like_rhs_axis_0(
If data in both corresponding DataFrame locations is missing the result will be missing.
only arithmetic binary operation has this parameter (e.g., add() has, but eq() doesn't have).
"""
from modin.pandas import Series

# Step 1: Convert other to a Series and join on the row position with self.
other_qc = Series(other)._query_compiler
Expand Down Expand Up @@ -2481,8 +2596,6 @@ def binary_op(

# Native pandas does not support binary operations between a Series and a list-like object.

from modin.pandas import Series
from modin.pandas.dataframe import DataFrame
from modin.pandas.utils import is_scalar

# fail explicitly for unsupported scenarios
Expand Down Expand Up @@ -10739,8 +10852,6 @@ def getitem_row_array(
New QueryCompiler that contains specified rows.
"""

from modin.pandas import Series

# convert key to internal frame via Series
key_frame = None
if isinstance(key, Series):
Expand Down
61 changes: 34 additions & 27 deletions src/snowflake/snowpark/modin/plugin/extensions/base_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,35 +7,42 @@
"""

from .base_overrides import register_base_override
from snowflake.snowpark.modin.plugin._internal.utils import (
MODIN_IS_AT_LEAST_0_34_0,
)

# TODO delete this file once modin 0.33.x is no longer supported
if not MODIN_IS_AT_LEAST_0_34_0:

@register_base_override("__array_function__")
def __array_function__(self, func: callable, types: tuple, args: tuple, kwargs: dict):
"""
Apply the `func` to the `BasePandasDataset`.
@register_base_override("__array_function__")
def __array_function__(
self, func: callable, types: tuple, args: tuple, kwargs: dict
):
"""
Apply the `func` to the `BasePandasDataset`.

Parameters
----------
func : np.func
The NumPy func to apply.
types : tuple
The types of the args.
args : tuple
The args to the func.
kwargs : dict
Additional keyword arguments.
Parameters
----------
func : np.func
The NumPy func to apply.
types : tuple
The types of the args.
args : tuple
The args to the func.
kwargs : dict
Additional keyword arguments.

Returns
-------
BasePandasDataset
The result of the ufunc applied to the `BasePandasDataset`.
"""
from snowflake.snowpark.modin.plugin.utils.numpy_to_pandas import (
numpy_to_pandas_func_map,
)
Returns
-------
BasePandasDataset
The result of the ufunc applied to the `BasePandasDataset`.
"""
from snowflake.snowpark.modin.plugin.utils.numpy_to_pandas import (
numpy_to_pandas_func_map,
)

if func.__name__ in numpy_to_pandas_func_map:
return numpy_to_pandas_func_map[func.__name__](*args, **kwargs)
else:
# per NEP18 we raise NotImplementedError so that numpy can intercept
return NotImplemented # pragma: no cover
if func.__name__ in numpy_to_pandas_func_map:
return numpy_to_pandas_func_map[func.__name__](*args, **kwargs)
else:
# per NEP18 we raise NotImplementedError so that numpy can intercept
return NotImplemented # pragma: no cover
80 changes: 43 additions & 37 deletions src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@
validate_percentile,
)

from snowflake.snowpark.modin.plugin._internal.utils import MODIN_IS_AT_LEAST_0_33_0
from snowflake.snowpark.modin.plugin._internal.utils import (
MODIN_IS_AT_LEAST_0_33_0,
MODIN_IS_AT_LEAST_0_34_0,
)
from snowflake.snowpark.modin.plugin._typing import ListLike
from snowflake.snowpark.modin.plugin.extensions.utils import (
ensure_index,
Expand Down Expand Up @@ -2372,45 +2375,48 @@ def f(x):


# Snowpark pandas has custom dispatch logic for ufuncs, while modin defaults to pandas.
@register_base_override("__array_ufunc__")
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
Apply the `ufunc` to the `BasePandasDataset`.
# TODO delete this method once 0.33.x is no longer supported
if MODIN_IS_AT_LEAST_0_34_0:

Parameters
----------
ufunc : np.ufunc
The NumPy ufunc to apply.
method : str
The method to apply.
*inputs : tuple
The inputs to the ufunc.
**kwargs : dict
Additional keyword arguments.
@register_base_override("__array_ufunc__")
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
Apply the `ufunc` to the `BasePandasDataset`.

Parameters
----------
ufunc : np.ufunc
The NumPy ufunc to apply.
method : str
The method to apply.
*inputs : tuple
The inputs to the ufunc.
**kwargs : dict
Additional keyword arguments.

Returns
-------
BasePandasDataset
The result of the ufunc applied to the `BasePandasDataset`.
"""
# Use pandas version of ufunc if it exists
if method != "__call__":
# Return sentinel value NotImplemented
return NotImplemented # pragma: no cover
from snowflake.snowpark.modin.plugin.utils.numpy_to_pandas import (
numpy_to_pandas_universal_func_map,
)

Returns
-------
BasePandasDataset
The result of the ufunc applied to the `BasePandasDataset`.
"""
# Use pandas version of ufunc if it exists
if method != "__call__":
# Return sentinel value NotImplemented
if ufunc.__name__ in numpy_to_pandas_universal_func_map:
ufunc = numpy_to_pandas_universal_func_map[ufunc.__name__]
if ufunc == NotImplemented:
return NotImplemented
# We cannot support the out argument
if kwargs.get("out") is not None:
return NotImplemented
return ufunc(self, inputs[1:])
# return the sentinel NotImplemented if we do not support this function
return NotImplemented # pragma: no cover
from snowflake.snowpark.modin.plugin.utils.numpy_to_pandas import (
numpy_to_pandas_universal_func_map,
)

if ufunc.__name__ in numpy_to_pandas_universal_func_map:
ufunc = numpy_to_pandas_universal_func_map[ufunc.__name__]
if ufunc == NotImplemented:
return NotImplemented
# We cannot support the out argument
if kwargs.get("out") is not None:
return NotImplemented
return ufunc(self, inputs[1:])
# return the sentinel NotImplemented if we do not support this function
return NotImplemented # pragma: no cover


# Snowpark pandas does extra argument validation.
Expand Down
16 changes: 15 additions & 1 deletion tests/integ/modin/hybrid/test_df_creation_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

# We're comparing an object with the native pandas backend, so we use the pandas testing utility
# here rather than our own internal one.
from pandas.testing import assert_series_equal
from pandas.testing import assert_series_equal, assert_frame_equal

import modin.pandas as pd
from modin.config import context as config_context
Expand Down Expand Up @@ -119,3 +119,17 @@ def test_constructor_does_not_double_move():
assert pd.DataFrame(pandas_df).get_backend() == "Pandas"
assert pd.DataFrame({"col0": pandas_df[0]}).get_backend() == "Pandas"
assert pd.DataFrame(pandas_df[0]).get_backend() == "Pandas"


@sql_count_checker(query_count=0)
def test_native_series_argument():
# SNOW-2173648: Operations like this assignment failed in QueryCompiler.move_to_me_cost()
df = pd.DataFrame({"a": [1, 2, 3]})
result_frame = pd.DataFrame(df["a"].to_pandas())
assert result_frame.get_backend() == "Pandas"
assert_frame_equal(
result_frame.to_pandas(),
native_pd.DataFrame({"a": [1, 2, 3]}),
check_column_type=False,
check_index_type=False,
)
Loading
Loading