Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions autosklearn/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ def predict(self, X, batch_size=None, n_jobs=1):

return predictions

def fit_ensemble(self, y, task=None, metric=None, precision='32',
def fit_ensemble(self, y, task=None, metric=None, precision=32,
dataset_name=None, ensemble_nbest=None,
ensemble_size=None):
if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit']:
Expand Down Expand Up @@ -1000,7 +1000,7 @@ def refit(self, X, y):

return super().refit(X, y)

def fit_ensemble(self, y, task=None, metric=None, precision='32',
def fit_ensemble(self, y, task=None, metric=None, precision=32,
dataset_name=None, ensemble_nbest=None,
ensemble_size=None):
_n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
Expand Down Expand Up @@ -1081,7 +1081,7 @@ def fit(
load_models=load_models,
)

def fit_ensemble(self, y, task=None, metric=None, precision='32',
def fit_ensemble(self, y, task=None, metric=None, precision=32,
dataset_name=None, ensemble_nbest=None,
ensemble_size=None):
y, _classes, _n_classes = self._process_target_classes(y)
Expand Down Expand Up @@ -1176,7 +1176,7 @@ def fit(
load_models=load_models,
)

def fit_ensemble(self, y, task=None, metric=None, precision='32',
def fit_ensemble(self, y, task=None, metric=None, precision=32,
dataset_name=None, ensemble_nbest=None,
ensemble_size=None):
y = super()._check_y(y)
Expand Down
162 changes: 79 additions & 83 deletions autosklearn/ensemble_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(
seed: int = 1,
shared_mode: bool = False,
max_iterations: int = None,
precision: str = "32",
precision: str = 32,
sleep_duration: int = 2,
memory_limit: int = 1000,
read_at_most: int = 5,
Expand Down Expand Up @@ -87,7 +87,7 @@ def __init__(
max_iterations: int
maximal number of iterations to run this script
(default None --> deactivated)
precision: ["16","32","64","128"]
precision: [16,32,64,128]
precision of floats to read the predictions
sleep_duration: int
duration of sleeping time between two iterations of this script (in sec)
Expand Down Expand Up @@ -249,7 +249,7 @@ def main(self, return_pred=False):
)

# populates self.read_preds
if not self.read_ensemble_preds():
if not self.score_ensemble_preds():
time.sleep(self.sleep_duration)
continue

Expand Down Expand Up @@ -319,9 +319,9 @@ def main(self, return_pred=False):
if return_pred:
return valid_pred, test_pred

def read_ensemble_preds(self):
def score_ensemble_preds(self):
"""
reading predictions on ensemble building data set;
score predictions on ensemble building data set;
populates self.read_preds
"""

Expand Down Expand Up @@ -411,40 +411,35 @@ def read_ensemble_preds(self):

# actually read the predictions and score them
try:
if y_ens_fn.endswith("gz"):
open_method = gzip.open
elif y_ens_fn.endswith("npy"):
open_method = open
else:
raise ValueError("Unknown filetype %s" % y_ens_fn)
with open_method(y_ens_fn, 'rb') as fp:
y_ensemble = self._read_np_fn(fp=fp)
score = calculate_score(solution=self.y_true_ensemble,
# y_ensemble = y_true for ensemble set
prediction=y_ensemble,
task_type=self.task_type,
metric=self.metric,
all_scoring_functions=False)

if self.read_preds[y_ens_fn]["ens_score"] > -1:
self.logger.debug(
'Changing ensemble score for file %s from %f to %f '
'because file modification time changed? %f - %f',
y_ens_fn,
self.read_preds[y_ens_fn]["ens_score"],
score,
self.read_preds[y_ens_fn]["mtime_ens"],
os.path.getmtime(y_ens_fn),
)

self.read_preds[y_ens_fn]["ens_score"] = score
self.read_preds[y_ens_fn][Y_ENSEMBLE] = y_ensemble
self.read_preds[y_ens_fn]["mtime_ens"] = os.path.getmtime(
y_ens_fn
y_ensemble = self._read_np_fn(y_ens_fn)
score = calculate_score(solution=self.y_true_ensemble,
prediction=y_ensemble,
task_type=self.task_type,
metric=self.metric,
all_scoring_functions=False)

if self.read_preds[y_ens_fn]["ens_score"] > -1:
self.logger.debug(
'Changing ensemble score for file %s from %f to %f '
'because file modification time changed? %f - %f',
y_ens_fn,
self.read_preds[y_ens_fn]["ens_score"],
score,
self.read_preds[y_ens_fn]["mtime_ens"],
os.path.getmtime(y_ens_fn),
)
self.read_preds[y_ens_fn]["loaded"] = 1

n_read_files += 1
self.read_preds[y_ens_fn]["ens_score"] = score

# There is no need to keep the predictions in memory here.
# To save memory, we only score them at this point.
# self.read_preds[y_ens_fn][Y_ENSEMBLE] = y_ensemble
self.read_preds[y_ens_fn]["mtime_ens"] = os.path.getmtime(
y_ens_fn
)
self.read_preds[y_ens_fn]["loaded"] = 2

n_read_files += 1

except Exception:
self.logger.warning(
Expand All @@ -468,7 +463,11 @@ def get_n_best_preds(self):
according to score on "ensemble set"
n: self.ensemble_nbest

Side effect: delete predictions of non-candidate models
Side effects:
->Define the n-best models to use in ensemble
->Only the best models are loaded
->Any model that is not among the best is a candidate for deletion
if the maximum number of models on disc is exceeded.
"""

sorted_keys = self._get_list_of_sorted_preds()
Expand Down Expand Up @@ -524,20 +523,6 @@ def get_n_best_preds(self):
)
keep_nbest = self.max_models_on_disc

for k, _, _ in sorted_keys[:keep_nbest]:
if self.read_preds[k][Y_ENSEMBLE] is None:
if k.endswith("gz"):
open_method = gzip.open
elif k.endswith("npy"):
open_method = open
else:
raise ValueError("Unknown filetype %s" % k)
with open_method(k, 'rb') as fp:
self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(fp=fp)
# No need to load valid and test here because they are loaded
# only if the model ends up in the ensemble
self.read_preds[k]['loaded'] = 1

# consider performance_range_threshold
if self.performance_range_threshold > 0:
best_score = sorted_keys[0][1]
Expand Down Expand Up @@ -576,6 +561,14 @@ def get_n_best_preds(self):
)
self.read_preds[k]['loaded'] = 2

# Load the predictions for the winning models
for k in sorted_keys[:ensemble_n_best]:
if self.read_preds[k][Y_ENSEMBLE] is None:
self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(k)
# No need to load valid and test here because they are loaded
# only if the model ends up in the ensemble
self.read_preds[k]['loaded'] = 1

# return best scored keys of self.read_preds
return sorted_keys[:ensemble_n_best]

Expand Down Expand Up @@ -635,17 +628,10 @@ def get_valid_test_preds(self, selected_keys: list):
success_keys_valid.append(k)
continue
try:
if valid_fn.endswith("gz"):
open_method = gzip.open
elif valid_fn.endswith("npy"):
open_method = open
else:
raise ValueError("Unknown filetype %s" % valid_fn)
with open_method(valid_fn, 'rb') as fp:
y_valid = self._read_np_fn(fp)
self.read_preds[k][Y_VALID] = y_valid
success_keys_valid.append(k)
self.read_preds[k]["mtime_valid"] = os.path.getmtime(valid_fn)
y_valid = self._read_np_fn(valid_fn)
self.read_preds[k][Y_VALID] = y_valid
success_keys_valid.append(k)
self.read_preds[k]["mtime_valid"] = os.path.getmtime(valid_fn)
except Exception:
self.logger.warning('Error loading %s: %s',
valid_fn, traceback.format_exc())
Expand All @@ -663,17 +649,10 @@ def get_valid_test_preds(self, selected_keys: list):
success_keys_test.append(k)
continue
try:
if test_fn.endswith("gz"):
open_method = gzip.open
elif test_fn.endswith("npy"):
open_method = open
else:
raise ValueError("Unknown filetype %s" % test_fn)
with open_method(test_fn, 'rb') as fp:
y_test = self._read_np_fn(fp)
self.read_preds[k][Y_TEST] = y_test
success_keys_test.append(k)
self.read_preds[k]["mtime_test"] = os.path.getmtime(test_fn)
y_test = self._read_np_fn(test_fn)
self.read_preds[k][Y_TEST] = y_test
success_keys_test.append(k)
self.read_preds[k]["mtime_test"] = os.path.getmtime(test_fn)
except Exception:
self.logger.warning('Error loading %s: %s',
test_fn, traceback.format_exc())
Expand Down Expand Up @@ -950,13 +929,30 @@ def _delete_excess_models(self):
for lock in locks:
lock.release()

def _read_np_fn(self, fp):
if self.precision == "16":
predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float16)
elif self.precision == "32":
predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float32)
elif self.precision == "64":
predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float64)
def _read_np_fn(self, path):

# Support for string precision
if isinstance(self.precision, str):
precision = int(self.precision)
self.logger.warning("Interpreted str-precision as {}".format(
precision
))
else:
predictions = np.load(fp, allow_pickle=True)
return predictions
precision = self.precision

if path.endswith("gz"):
open_method = gzip.open
elif path.endswith("npy"):
open_method = open
else:
raise ValueError("Unknown filetype %s" % path)
with open_method(path, 'rb') as fp:
if precision == 16:
predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float16)
elif precision == 32:
predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float32)
elif precision == 64:
predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float64)
else:
predictions = np.load(fp, allow_pickle=True)
return predictions
10 changes: 5 additions & 5 deletions test/test_ensemble_builder/test_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def testRead(self):
seed=0, # important to find the test files
)

success = ensbuilder.read_ensemble_preds()
success = ensbuilder.score_ensemble_preds()
self.assertTrue(success, str(ensbuilder.read_preds))
self.assertEqual(len(ensbuilder.read_preds), 3)

Expand Down Expand Up @@ -93,7 +93,7 @@ def testNBest(self):
max_models_on_disc=models_in_disc,
)

ensbuilder.read_ensemble_preds()
ensbuilder.score_ensemble_preds()
sel_keys = ensbuilder.get_n_best_preds()

self.assertEqual(len(sel_keys), exp)
Expand Down Expand Up @@ -168,7 +168,7 @@ def testFallBackNBest(self):
ensemble_nbest=1
)

ensbuilder.read_ensemble_preds()
ensbuilder.score_ensemble_preds()

filename = os.path.join(
self.backend.temporary_directory,
Expand Down Expand Up @@ -209,7 +209,7 @@ def testGetValidTestPreds(self):
ensemble_nbest=1
)

ensbuilder.read_ensemble_preds()
ensbuilder.score_ensemble_preds()

d1 = os.path.join(
self.backend.temporary_directory,
Expand Down Expand Up @@ -251,7 +251,7 @@ def testEntireEnsembleBuilder(self):
)
ensbuilder.SAVE2DISC = False

ensbuilder.read_ensemble_preds()
ensbuilder.score_ensemble_preds()

d2 = os.path.join(
self.backend.temporary_directory,
Expand Down