20
20
# measurements
21
21
22
22
import numpy as np
23
+ import pandas ._testing as tm
23
24
24
25
from .utils import (
25
26
generate_dataframe ,
@@ -127,12 +128,56 @@ def time_join(self, shapes, how, sort):
127
128
execute (self .df1 .join (self .df2 , how = how , lsuffix = "left_" , sort = sort ))
128
129
129
130
131
class TimeJoinStringIndex:
    """Benchmarks for ``DataFrame.join`` on string key columns.

    The left frame carries two string key columns (``key1``, ``key2``); the
    right-hand frames are indexed by the same string labels, either as a
    two-level ``MultiIndex`` or as a single-level index.
    """

    param_names = ["shapes", "sort"]
    params = [
        get_benchmark_shapes("TimeJoinStringIndex"),
        [True, False],
    ]

    def setup(self, shapes, sort):
        """Create the left frame and the three right-hand frames to join with.

        Parameters
        ----------
        shapes : sequence
            Shape of the left frame; ``shapes[0]`` must be a multiple of 100.
        sort : bool
            Unused here; consumed by the ``time_*`` methods.
        """
        assert shapes[0] % 100 == 0, "implementation restriction"
        n_inner = shapes[0] // 100
        value_columns = ["A", "B", "C", "D"]

        # Build the string labels directly with NumPy rather than through the
        # private ``pandas._testing.makeStringIndex`` helper, which is not
        # public API and is unavailable in recent pandas releases.
        level1 = np.array([f"i-{i}" for i in range(10)], dtype=object)
        level2 = np.array([f"i-{i}" for i in range(n_inner)], dtype=object)

        # Cartesian product of the two levels: 10 * n_inner index entries.
        codes1 = np.arange(10).repeat(n_inner)
        codes2 = np.tile(np.arange(n_inner), 10)
        index2 = IMPL.MultiIndex(levels=[level1, level2], codes=[codes1, codes2])
        self.df_multi = IMPL.DataFrame(
            np.random.randn(len(index2), 4), index=index2, columns=value_columns
        )

        # Repeat the product ten times so each key array has shapes[0] entries.
        self.key1 = np.tile(level1.take(codes1), 10)
        self.key2 = np.tile(level2.take(codes2), 10)
        # NOTE(review): assumes generate_dataframe returns a frame with
        # shapes[0] rows, so the key arrays line up -- confirm in utils.
        self.df = generate_dataframe("int", *shapes, RAND_LOW, RAND_HIGH)
        # just to keep source shape: the two dropped columns are replaced
        # by the two key columns added below
        self.df = self.df.drop(columns=self.df.columns[-2:])
        self.df["key1"] = self.key1
        self.df["key2"] = self.key2
        execute(self.df)

        self.df_key1 = IMPL.DataFrame(
            np.random.randn(len(level1), 4), index=level1, columns=value_columns
        )
        self.df_key2 = IMPL.DataFrame(
            np.random.randn(len(level2), 4), index=level2, columns=value_columns
        )

    def time_join_dataframe_index_multi(self, shapes, sort):
        """Join on both key columns against the ``MultiIndex``-ed frame."""
        execute(self.df.join(self.df_multi, on=["key1", "key2"], sort=sort))

    def time_join_dataframe_index_single_key_bigger(self, shapes, sort):
        """Join on ``key2`` against the frame indexed by the second level."""
        execute(self.df.join(self.df_key2, on="key2", sort=sort))

    def time_join_dataframe_index_single_key_small(self, shapes, sort):
        """Join on ``key1`` against the 10-row frame indexed by the first level."""
        execute(self.df.join(self.df_key1, on="key1", sort=sort))
173
+
174
+
130
175
class TimeMerge :
131
176
param_names = ["shapes" , "how" , "sort" ]
132
177
params = [
133
178
get_benchmark_shapes ("TimeMerge" ),
134
179
["left" , "inner" ],
135
- [False ],
180
+ [True , False ],
136
181
]
137
182
138
183
def setup (self , shapes , how , sort ):
@@ -147,6 +192,19 @@ def time_merge(self, shapes, how, sort):
147
192
)
148
193
)
149
194
195
def time_merge_default(self, shapes, how, sort):
    """Merge ``self.df1`` with ``self.df2`` without specifying join keys."""
    merged = IMPL.merge(self.df1, self.df2, how=how, sort=sort)
    execute(merged)
197
+
198
def time_merge_dataframe_empty_right(self, shapes, how, sort):
    """Merge ``self.df1`` with a zero-row slice of ``self.df2``."""
    # Slicing with `iloc[:0]` to obtain the empty frame is expected to be
    # very cheap, so it should not noticeably affect the measured merge time.
    empty_right = self.df2.iloc[:0]
    merged = IMPL.merge(self.df1, empty_right, how=how, sort=sort)
    execute(merged)
202
+
203
def time_merge_dataframe_empty_left(self, shapes, how, sort):
    """Merge a zero-row slice of ``self.df1`` with ``self.df2``."""
    # Slicing with `iloc[:0]` to obtain the empty frame is expected to be
    # very cheap, so it should not noticeably affect the measured merge time.
    empty_left = self.df1.iloc[:0]
    merged = IMPL.merge(empty_left, self.df2, how=how, sort=sort)
    execute(merged)
207
+
150
208
151
209
class TimeMergeCategoricals :
152
210
param_names = ["shapes" , "data_type" ]
@@ -759,3 +817,6 @@ def time_columns(self, shape):
759
817
760
818
def time_index(self, shape):
    """Return the ``index`` attribute of ``self.df``."""
    frame = self.df
    return frame.index
820
+
821
+
822
+ from .utils import setup # noqa: E402, F401
0 commit comments