26 changes: 8 additions & 18 deletions autosklearn/metalearning/metalearning/kNearestDatasets/kND.py
@@ -2,6 +2,7 @@
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
import sklearn.utils

from ....util.logging_ import get_logger
@@ -18,6 +19,7 @@ def __init__(self, metric='l1', random_state=None, metric_params=None):
self.runs = None
self.best_configuration_per_dataset = None
self.random_state = sklearn.utils.check_random_state(random_state)
self.scaler = MinMaxScaler()

if self.metric_params is None:
self.metric_params = {}
@@ -44,6 +46,9 @@ def fit(self, metafeatures, runs):
self.runs = runs
self.num_datasets = runs.shape[1]

# Fit the scaler on the training metafeatures
self.scaler.fit(self.metafeatures)

# for each dataset, sort the runs according to their result
best_configuration_per_dataset = {}
for dataset_name in runs:
@@ -102,8 +107,9 @@ def kNearestDatasets(self, x, k=1, return_distance=False):
elif k == -1:
k = self.num_datasets

X_train, x = self._scale(self.metafeatures, x)
X_train = self.scaler.transform(self.metafeatures)
x = x.values.reshape((1, -1))
x = self.scaler.transform(x)
self._nearest_neighbors.fit(X_train)
distances, neighbor_indices = self._nearest_neighbors.kneighbors(
x, n_neighbors=k, return_distance=True)
@@ -126,6 +132,7 @@ def kBestSuggestions(self, x, k=1, exclude_double_configurations=True):
raise ValueError('Number of neighbors k cannot be zero or negative.')
nearest_datasets, distances = self.kNearestDatasets(x, -1,
return_distance=True)

kbest = []

added_configurations = set()
@@ -151,20 +158,3 @@ def kBestSuggestions(self, x, k=1, exclude_double_configurations=True):
if k == -1:
k = len(kbest)
return kbest[:k]

def _scale(self, metafeatures, other):
assert isinstance(other, pd.Series), type(other)
assert other.values.dtype == np.float64
scaled_metafeatures = metafeatures.copy(deep=True)
other = other.copy(deep=True)

mins = scaled_metafeatures.min()
maxs = scaled_metafeatures.max()
# I also need to scale the target dataset meta features...
mins = pd.DataFrame(data=[mins, other]).min()
maxs = pd.DataFrame(data=[maxs, other]).max()
divisor = (maxs-mins)
divisor[divisor == 0] = 1
scaled_metafeatures = (scaled_metafeatures - mins) / divisor
other = (other - mins) / divisor
return scaled_metafeatures, other
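
For context on why the expected distances in the tests below change: the removed _scale() folded the query point into the min/max computation, while the MinMaxScaler introduced here is fit on the training metafeatures only, so a query that falls outside their range is mapped outside [0, 1]. A minimal standalone sketch of that difference, using made-up metafeature values (not taken from the test fixtures):

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Toy metafeatures for two training datasets; values are illustrative only.
metafeatures = pd.DataFrame(
    [[10.0, 2.0], [50.0, 4.0]],
    columns=["number_of_instances", "number_of_classes"],
)
query = pd.Series([80.0, 3.0], index=metafeatures.columns)

# Old behaviour (removed _scale): min/max computed over the training rows *and* the query.
mins = pd.concat([metafeatures, query.to_frame().T]).min()
maxs = pd.concat([metafeatures, query.to_frame().T]).max()
divisor = (maxs - mins).replace(0, 1)
old_query = (query - mins) / divisor                         # [1.0, 0.5] -- always within [0, 1]

# New behaviour: the scaler only ever sees the training metafeatures.
scaler = MinMaxScaler().fit(metafeatures)
new_query = scaler.transform(query.values.reshape((1, -1)))  # [[1.75, 0.5]] -- 80 exceeds the training max of 50

print(old_query.values, new_query)
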
24 changes: 6 additions & 18 deletions test/test_metalearning/pyMetaLearn/metalearning/test_kND.py
@@ -42,21 +42,21 @@ def test_kNearestDatasets(self):
neighbor, distance = kND.kNearestDatasets(self.anneal, 1,
return_distance=True)
self.assertEqual([233], neighbor)
np.testing.assert_array_almost_equal([1.82298937], distance)
np.testing.assert_array_almost_equal([3.8320802803440586], distance)

neighbors = kND.kNearestDatasets(self.anneal, 2)
self.assertEqual([233, 234], neighbors)
neighbors, distance = kND.kNearestDatasets(self.anneal, 2,
return_distance=True)
self.assertEqual([233, 234], neighbors)
np.testing.assert_array_almost_equal([1.822989, 2.267919], distance)
np.testing.assert_array_almost_equal([3.8320802803440586, 4.367919719655942], distance)

neighbors = kND.kNearestDatasets(self.anneal, -1)
self.assertEqual([233, 234], neighbors)
neighbors, distance = kND.kNearestDatasets(self.anneal, -1,
return_distance=True)
self.assertEqual([233, 234], neighbors)
np.testing.assert_array_almost_equal([1.822989, 2.267919], distance)
np.testing.assert_array_almost_equal([3.8320802803440586, 4.367919719655942], distance)

self.assertRaises(ValueError, kND.kNearestDatasets, self.anneal, 0)
self.assertRaises(ValueError, kND.kNearestDatasets, self.anneal, -2)
@@ -67,35 +67,23 @@ def test_kBestSuggestions(self):
self.runs.loc[:, [233, 234]])
neighbor = kND.kBestSuggestions(self.anneal, 1)
np.testing.assert_array_almost_equal(
[(233, 1.8229893712531495, 1)],
[(233, 3.8320802803440586, 1)],
neighbor,
)
neighbors = kND.kBestSuggestions(self.anneal, 2)
np.testing.assert_array_almost_equal(
[(233, 1.8229893712531495, 1), (234, 2.2679197196559415, 2)],
[(233, 3.8320802803440586, 1), (234, 4.367919719655942, 2)],
neighbors,
)
neighbors = kND.kBestSuggestions(self.anneal, -1)
np.testing.assert_array_almost_equal(
[(233, 1.8229893712531495, 1), (234, 2.2679197196559415, 2)],
[(233, 3.8320802803440586, 1), (234, 4.367919719655942, 2)],
neighbors,
)

self.assertRaises(ValueError, kND.kBestSuggestions, self.anneal, 0)
self.assertRaises(ValueError, kND.kBestSuggestions, self.anneal, -2)

def test_scale(self):
kND = KNearestDatasets()
metafeatures = pd.DataFrame([self.anneal, self.krvskp])
metafeatures, other = kND._scale(metafeatures, self.labor)
from pandas.util.testing import assert_series_equal
# Series.equal does not work properly with floats...
assert_series_equal(metafeatures.iloc[0],
pd.Series({"number_of_instances": 0.267919719656,
"number_of_classes": 1,
"number_of_features": 1},
name=232))

def test_random_metric(self):
kND = KNearestDatasets(metric=get_random_metric(random_state=1))
kND.fit(pd.DataFrame([self.krvskp, self.labor]),
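
For reviewers, a hypothetical usage sketch of the updated class. The data layout is an assumption inferred from the tests (metafeatures indexed by dataset id; runs holding one column of per-configuration losses per dataset, lower being better); the ids and values are invented, not taken from the fixtures:

import pandas as pd

from autosklearn.metalearning.metalearning.kNearestDatasets.kND import KNearestDatasets

# Metafeatures of the training datasets, indexed by dataset id (values invented).
metafeatures = pd.DataFrame(
    {"number_of_instances": [898.0, 3196.0],
     "number_of_classes": [5.0, 2.0],
     "number_of_features": [38.0, 36.0]},
    index=[233, 234],
)
# Rows: candidate configurations, columns: dataset ids, values: validation losses (assumed).
runs = pd.DataFrame({233: [0.9, 0.1, 0.5], 234: [0.4, 0.8, 0.2]})

# Metafeatures of the new dataset we want suggestions for.
query = pd.Series({"number_of_instances": 57.0,
                   "number_of_classes": 2.0,
                   "number_of_features": 16.0})

knd = KNearestDatasets(metric="l1")
knd.fit(metafeatures, runs)

# Ids of the most similar datasets and their distances in the scaled space.
neighbors, distances = knd.kNearestDatasets(query, 2, return_distance=True)

# (dataset_id, distance, best configuration id) triples, nearest first.
suggestions = knd.kBestSuggestions(query, k=2)
print(neighbors, distances, suggestions)
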