Skip to content

Commit 2afe0c9

Browse files
committed
Merge branch 'main' into doc_build
2 parents 83de78f + 1fb3d12 commit 2afe0c9

File tree

5 files changed

+26
-9
lines changed

5 files changed

+26
-9
lines changed

tpot/evolvers/steady_state_evolver.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,7 @@ def optimize(self):
444444
print("Cancelled future (likely memory related)")
445445
scores = [np.nan for _ in range(len(self.objective_names))]
446446
eval_error = "INVALID"
447+
client.run(gc.collect)
447448
else: #if the future is done and did not throw an error, get the scores
448449
try:
449450
scores = completed_future.result()
@@ -466,13 +467,14 @@ def optimize(self):
466467
print("cancelld ", completed_future.cancelled())
467468
scores = [np.nan for _ in range(len(self.objective_names))]
468469
eval_error = "INVALID"
470+
completed_future.release() #release the future
469471
else: #if future is not done
470472

471473
if self.max_eval_time_mins is not None:
472474
#check if the future has been running for too long, cancel the future
473475
if time.time() - submitted_futures[completed_future]["time"] > self.max_eval_time_mins*1.25*60:
474476
completed_future.cancel()
475-
477+
completed_future.release() #release the future
476478
if self.verbose >= 4:
477479
print(f'WARNING AN INDIVIDUAL TIMED OUT (Fallback): \n {submitted_futures[completed_future]} \n')
478480

@@ -506,6 +508,8 @@ def optimize(self):
506508
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="INVALID")
507509
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="TIMEOUT")
508510

511+
#Possibly unnecessary: calling release() on the futures should be enough to free up memory. If memory issues persist, re-enabling the gc.collect call below is a good place to start.
512+
#client.run(gc.collect) #run garbage collection to free up memory
509513

510514
###############################
511515
# Step 2: Early Stopping
@@ -717,6 +721,10 @@ def optimize(self):
717721
#done, cleanup futures
718722
for future in submitted_futures.keys():
719723
future.cancel()
724+
future.release() #release the future
725+
726+
#Possibly unnecessary: calling release() on the futures should be enough to free up memory. If memory issues persist, re-enabling the gc.collect call below is a good place to start.
727+
#client.run(gc.collect) #run garbage collection to free up memory
720728

721729
#checkpoint
722730
if self.population_file is not None:

tpot/tests/test_estimators.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def tpot_estimator_with_pipeline(tpot_estimator,sample_dataset):
8585
tpot_estimator.fit(sample_dataset[0], sample_dataset[1])
8686
return tpot_estimator
8787

88-
@pytest.mark.skip(reason="Errors out, skipping to build docs")
88+
# @pytest.mark.skip(reason="Errors out, skipping to build docs")
8989
def test_tpot_estimator_predict(tpot_estimator_with_pipeline,sample_dataset):
9090
#X_test = [[1, 2, 3], [4, 5, 6]]
9191
X_test = sample_dataset[0]

tpot/tpot_estimator/estimator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -581,15 +581,15 @@ def fit(self, X, y):
581581
if self.categorical_features is not None: #if categorical features are specified, use those
582582
pipeline_steps.append(("impute_categorical", tpot.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent')))
583583
pipeline_steps.append(("impute_numeric", tpot.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
584-
pipeline_steps.append(("ColumnOneHotEncoder", tpot.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent')))
584+
pipeline_steps.append(("ColumnOneHotEncoder", tpot.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001))) # retain wrong param fix
585585

586586
else:
587587
if isinstance(X, pd.DataFrame):
588588
categorical_columns = X.select_dtypes(include=['object']).columns
589589
if len(categorical_columns) > 0:
590590
pipeline_steps.append(("impute_categorical", tpot.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent')))
591591
pipeline_steps.append(("impute_numeric", tpot.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
592-
pipeline_steps.append(("ColumnOneHotEncoder", tpot.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent')))
592+
pipeline_steps.append(("ColumnOneHotEncoder", tpot.builtin_modules.ColumnOneHotEncoder("categorical", min_frequency=0.0001))) # retain wrong param fix
593593
else:
594594
pipeline_steps.append(("impute_numeric", tpot.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
595595
else:

tpot/tpot_estimator/steady_state_estimator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -624,15 +624,15 @@ def fit(self, X, y):
624624
if self.categorical_features is not None: #if categorical features are specified, use those
625625
pipeline_steps.append(("impute_categorical", tpot.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent')))
626626
pipeline_steps.append(("impute_numeric", tpot.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
627-
pipeline_steps.append(("impute_categorical", tpot.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent')))
627+
pipeline_steps.append(("ColumnOneHotEncoder", tpot.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent')))
628628

629629
else:
630630
if isinstance(X, pd.DataFrame):
631631
categorical_columns = X.select_dtypes(include=['object']).columns
632632
if len(categorical_columns) > 0:
633633
pipeline_steps.append(("impute_categorical", tpot.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent')))
634634
pipeline_steps.append(("impute_numeric", tpot.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
635-
pipeline_steps.append(("impute_categorical", tpot.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent')))
635+
pipeline_steps.append(("ColumnOneHotEncoder", tpot.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent')))
636636
else:
637637
pipeline_steps.append(("impute_numeric", tpot.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
638638
else:

tpot/utils/eval_utils.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from dask.distributed import progress
5050
import distributed
5151
import func_timeout
52+
import gc
5253

5354
def process_scores(scores, n):
5455
'''
@@ -163,6 +164,7 @@ def parallel_eval_objective_list(individual_list,
163164
print("Cancelled future (likely memory related)")
164165
scores = [np.nan for _ in range(n_expected_columns)]
165166
eval_error = "INVALID"
167+
client.run(gc.collect)
166168
else: #if the future is done and did not throw an error, get the scores
167169
try:
168170
scores = completed_future.result()
@@ -186,20 +188,23 @@ def parallel_eval_objective_list(individual_list,
186188
print("cancelld ", completed_future.cancelled())
187189
scores = [np.nan for _ in range(n_expected_columns)]
188190
eval_error = "INVALID"
191+
192+
completed_future.release() #release the future
189193
else: #if future is not done
190194

191195
# check if the future has been running for too long, cancel the future
192196
# we multiply max_eval_time_mins by 1.25 since the objective function in the future should be able to cancel itself. This is a backup in case it doesn't.
193197
if max_eval_time_mins is not None and time.time() - submitted_futures[completed_future]["time"] > max_eval_time_mins*1.25*60:
194198
completed_future.cancel()
195-
199+
completed_future.release()
196200
if verbose >= 4:
197201
print(f'WARNING AN INDIVIDUAL TIMED OUT (Fallback): \n {submitted_futures[completed_future]} \n')
198202

199203
scores = [np.nan for _ in range(n_expected_columns)]
200204
eval_error = "TIMEOUT"
201205
elif global_timeout_triggered:
202206
completed_future.cancel()
207+
completed_future.release()
203208

204209
if verbose >= 4:
205210
print(f'WARNING AN INDIVIDUAL TIMED OUT (max_time_mins): \n {submitted_futures[completed_future]} \n')
@@ -222,6 +227,10 @@ def parallel_eval_objective_list(individual_list,
222227
#update submitted futures
223228
submitted_futures.pop(completed_future)
224229

230+
231+
#Possibly unnecessary: calling release() on the futures should be enough to free up memory. If memory issues persist, re-enabling the gc.collect call below is a good place to start.
232+
#client.run(gc.collect) #run garbage collection to free up memory
233+
225234
#break if timeout
226235
if global_timeout_triggered:
227236
while len(individual_stack) > 0:
@@ -243,10 +252,10 @@ def parallel_eval_objective_list(individual_list,
243252

244253
submitted_inds.add(individual.unique_id())
245254

255+
#Possibly unnecessary: calling release() on the futures should be enough to free up memory. If memory issues persist, re-enabling the gc.collect call below is a good place to start.
256+
#client.run(gc.collect) #run garbage collection to free up memory
246257

247258
#collect remaining futures
248-
249-
250259
final_scores = [scores_dict[individual]["scores"] for individual in individual_list]
251260
final_start_times = [scores_dict[individual]["start_time"] for individual in individual_list]
252261
final_end_times = [scores_dict[individual]["end_time"] for individual in individual_list]

0 commit comments

Comments
 (0)