Skip to content

Commit abcf1e9

Browse files
authored
BENCH: add some cases for join and merge ops from pandas (#5021)
Signed-off-by: Myachev <[email protected]>
1 parent d005429 commit abcf1e9

File tree

4 files changed

+74
-1
lines changed

4 files changed

+74
-1
lines changed

asv_bench/benchmarks/benchmarks.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# measurements
2121

2222
import numpy as np
23+
import pandas._testing as tm
2324

2425
from .utils import (
2526
generate_dataframe,
@@ -127,12 +128,56 @@ def time_join(self, shapes, how, sort):
127128
execute(self.df1.join(self.df2, how=how, lsuffix="left_", sort=sort))
128129

129130

131+
class TimeJoinStringIndex:
    """Benchmark ``DataFrame.join`` keyed by string-valued indexes.

    Mirrors the equivalent pandas asv case: a flat frame carrying two
    string key columns is joined against frames indexed by those keys
    (a two-level MultiIndex and each single level on its own).
    """

    param_names = ["shapes", "sort"]
    params = [
        get_benchmark_shapes("TimeJoinStringIndex"),
        [True, False],
    ]

    @staticmethod
    def _random_str_array(nentries):
        """Return ``nentries`` random 10-character alphanumeric strings.

        Stdlib/NumPy replacement for ``pandas._testing.makeStringIndex``,
        which was deprecated in pandas 2.2 and removed in pandas 3.0.
        Collisions over a 62**10 key space are negligible, matching the
        practical uniqueness of the old helper.
        """
        import string

        alphabet = np.array(list(string.ascii_letters + string.digits))
        picks = np.random.randint(0, len(alphabet), size=(nentries, 10))
        return np.array(["".join(row) for row in alphabet[picks]])

    def setup(self, shapes, sort):
        # The key layout below carves shapes[0] rows into 10 coarse
        # groups (key1) of shapes[0] // 100 fine entries each (key2).
        assert shapes[0] % 100 == 0, "implementation restriction"
        level1 = self._random_str_array(10)
        level2 = self._random_str_array(shapes[0] // 100)
        codes1 = np.arange(10).repeat(shapes[0] // 100)
        codes2 = np.tile(np.arange(shapes[0] // 100), 10)
        index2 = IMPL.MultiIndex(levels=[level1, level2], codes=[codes1, codes2])
        self.df_multi = IMPL.DataFrame(
            np.random.randn(len(index2), 4), index=index2, columns=["A", "B", "C", "D"]
        )

        self.key1 = np.tile(level1.take(codes1), 10)
        self.key2 = np.tile(level2.take(codes2), 10)
        self.df = generate_dataframe("int", *shapes, RAND_LOW, RAND_HIGH)
        # just to keep source shape
        self.df = self.df.drop(columns=self.df.columns[-2:])
        self.df["key1"] = self.key1
        self.df["key2"] = self.key2
        # Force materialization so setup cost is not charged to the timers.
        execute(self.df)

        self.df_key1 = IMPL.DataFrame(
            np.random.randn(len(level1), 4), index=level1, columns=["A", "B", "C", "D"]
        )
        self.df_key2 = IMPL.DataFrame(
            np.random.randn(len(level2), 4), index=level2, columns=["A", "B", "C", "D"]
        )

    def time_join_dataframe_index_multi(self, shapes, sort):
        """Join on both key columns against the MultiIndexed frame."""
        execute(self.df.join(self.df_multi, on=["key1", "key2"], sort=sort))

    def time_join_dataframe_index_single_key_bigger(self, shapes, sort):
        """Join on the fine-grained key (many distinct values)."""
        execute(self.df.join(self.df_key2, on="key2", sort=sort))

    def time_join_dataframe_index_single_key_small(self, shapes, sort):
        """Join on the coarse key (only 10 distinct values)."""
        execute(self.df.join(self.df_key1, on="key1", sort=sort))
174+
130175
class TimeMerge:
131176
param_names = ["shapes", "how", "sort"]
132177
params = [
133178
get_benchmark_shapes("TimeMerge"),
134179
["left", "inner"],
135-
[False],
180+
[True, False],
136181
]
137182

138183
def setup(self, shapes, how, sort):
@@ -147,6 +192,19 @@ def time_merge(self, shapes, how, sort):
147192
)
148193
)
149194

195+
def time_merge_default(self, shapes, how, sort):
    """Time a plain ``IMPL.merge`` of the two frames built in ``setup``."""
    merged = IMPL.merge(self.df1, self.df2, how=how, sort=sort)
    execute(merged)
197+
198+
def time_merge_dataframe_empty_right(self, shapes, how, sort):
    """Merge a populated left frame with an empty right frame.

    ``iloc[:0]`` slices out an empty frame very cheaply, so the
    measurement stays dominated by the merge itself.
    """
    empty_right = self.df2.iloc[:0]
    execute(IMPL.merge(self.df1, empty_right, how=how, sort=sort))
202+
203+
def time_merge_dataframe_empty_left(self, shapes, how, sort):
    """Merge an empty left frame with a populated right frame.

    ``iloc[:0]`` slices out an empty frame very cheaply, so the
    measurement stays dominated by the merge itself.
    """
    empty_left = self.df1.iloc[:0]
    execute(IMPL.merge(empty_left, self.df2, how=how, sort=sort))
207+
150208

151209
class TimeMergeCategoricals:
152210
param_names = ["shapes", "data_type"]
@@ -759,3 +817,6 @@ def time_columns(self, shape):
759817

760818
def time_index(self, shape):
761819
return self.df.index
820+
821+
822+
from .utils import setup # noqa: E402, F401

asv_bench/benchmarks/utils/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
random_booleans,
3333
translator_groupby_ngroups,
3434
trigger_import,
35+
setup,
3536
)
3637

3738
__all__ = [
@@ -54,4 +55,5 @@
5455
"random_booleans",
5556
"translator_groupby_ngroups",
5657
"trigger_import",
58+
"setup",
5759
]

asv_bench/benchmarks/utils/common.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,3 +594,10 @@ def prepare_io_data_parquet(test_filename: str, data_type: str, shapes: list):
594594
df.to_parquet(test_filenames[shape_id], index=False)
595595

596596
return test_filenames
597+
598+
599+
def setup(*args, **kwargs):  # noqa: GL08
    """Reset the global NumPy RNG so every benchmark sees the same state.

    ASV runs a module-level ``setup`` automatically before each benchmark
    function, so importing this into a benchmark file is all that is
    needed for reproducible data generation.
    https://asv.readthedocs.io/en/latest/writing_benchmarks.html
    """
    np.random.seed(seed=42)

asv_bench/benchmarks/utils/data_shapes.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,9 @@
169169
DEFAULT_CONFIG["MergeCategoricals"] = (
170170
[[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]]
171171
)
172+
# Shapes for the string-index join benchmarks, scaled by dataset size.
if ASV_DATASET_SIZE == "big":
    DEFAULT_CONFIG["TimeJoinStringIndex"] = [[100_000, 64]]
else:
    DEFAULT_CONFIG["TimeJoinStringIndex"] = [[1_000, 4]]
172175
for config in (_DEFAULT_CONFIG_T, _DEFAULT_HDK_CONFIG_T):
173176
for _shape, _names in config:
174177
DEFAULT_CONFIG.update({_name: _shape for _name in _names})

0 commit comments

Comments (0)