44 changes: 35 additions & 9 deletions examples/20_basic/example_regression.py
@@ -11,18 +11,18 @@
import sklearn.metrics

import autosklearn.regression
import matplotlib.pyplot as plt


############################################################################
############################
# Data Loading
# ============

X, y = sklearn.datasets.load_boston(return_X_y=True)
X, y = sklearn.datasets.load_diabetes(return_X_y=True)

X_train, X_test, y_train, y_test = \
    sklearn.model_selection.train_test_split(X, y, random_state=1)

############################################################################
###########################
# Build and fit a regressor
# =========================

@@ -32,17 +32,43 @@
tmp_folder='/tmp/autosklearn_regression_example_tmp',
output_folder='/tmp/autosklearn_regression_example_out',
)
automl.fit(X_train, y_train, dataset_name='boston')
automl.fit(X_train, y_train, dataset_name='diabetes')

############################################################################
######################################################
# Print the final ensemble constructed by auto-sklearn
# ====================================================

print(automl.show_models())

###########################################################################
#####################################
# Get the Score of the final ensemble
# ===================================
# After training the estimator, we can now quantify the goodness of fit. One possibility
# is the `R2 score <https://scikit-learn.org/stable/modules/model_evaluation.html#r2-score>`_.
# Its values range between -inf and 1, with 1 being the best possible value. A dummy estimator
# that always predicts the data mean has an R2 score of 0.

train_predictions = automl.predict(X_train)
print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
test_predictions = automl.predict(X_test)
print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))

######################
# Plot the predictions
# ====================
# Furthermore, we can now visually inspect the predictions. We plot the true value against the
# predictions and show results on train and test data. Points on the diagonal depict perfect
# predictions. Points below the diagonal were overestimated by the model (the predicted value is
# higher than the true value); points above the diagonal were underestimated (the predicted value
# is lower than the true value).

predictions = automl.predict(X_test)
print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))
plt.scatter(train_predictions, y_train, label="Train samples", c='#d95f02')
plt.scatter(test_predictions, y_test, label="Test samples", c='#7570b3')
plt.xlabel("Predicted value")
plt.ylabel("True value")
plt.legend()
plt.plot([30, 400], [30, 400], c='k', zorder=0)
plt.xlim([30, 400])
plt.ylim([30, 400])
plt.tight_layout()
plt.show()
112 changes: 112 additions & 0 deletions examples/40_advanced/example_inspect_predictions.py
@@ -0,0 +1,112 @@
# -*- encoding: utf-8 -*-
"""
=================
Model Explanation
=================

The following example shows how to fit a simple classification model with
*auto-sklearn* and use the `inspection module <https://scikit-learn.org/stable/inspection.html>`_ from
scikit-learn to understand what affects the predictions.
"""
import sklearn.datasets
import sklearn.model_selection
from sklearn.inspection import plot_partial_dependence, permutation_importance
import matplotlib.pyplot as plt
import autosklearn.classification


#############################
# Load Data and Build a Model
# ===========================
#
# We start by loading the "Run or walk" dataset from OpenML and train an auto-sklearn model on it.
# For this dataset, the goal is to predict whether a person is running or walking based on
# accelerometer and gyroscope data collected by a phone. For more information see
# `here <https://www.openml.org/d/40922>`_.

dataset = sklearn.datasets.fetch_openml(data_id=40922)

# Note: To speed up the example, we subsample the dataset
dataset.data = dataset.data.sample(n=5000, random_state=1, axis="index")
dataset.target = dataset.target[dataset.data.index]
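
# Optional sanity check (an illustrative addition): verify that both classes
# are still present after subsampling. This assumes ``fetch_openml`` returned
# a pandas DataFrame/Series, which the ``.sample`` call above already relies on.
print(dataset.target.value_counts())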
X_train, X_test, y_train, y_test = \
    sklearn.model_selection.train_test_split(dataset.data, dataset.target, test_size=0.3,
                                             random_state=1)

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    tmp_folder='/tmp/autosklearn_inspect_predictions_example_tmp',
    output_folder='/tmp/autosklearn_classification_example_out',
Review comment (Contributor): It's no longer necessary to provide an output folder.

    n_jobs=1,
Review comment (Contributor): It's not necessary to give the number of jobs.

)
automl.fit(X_train, y_train, dataset_name='Run_or_walk_information')

s = automl.score(X_train, y_train)
print(f"Train score {s}")
s = automl.score(X_test, y_test)
print(f"Test score {s}")

#########################################
# Compute permutation importance - part 1
# =======================================
#
# Since auto-sklearn implements the scikit-learn interface, it can be used with scikit-learn's
# inspection module. We first look at the `permutation importance
# <https://christophm.github.io/interpretable-ml-book/feature-importance.html>`_, which is defined
# as the decrease in a model's score when a single feature's values are randomly permuted. The
# higher this value, the more the model's predictions depend on that feature.
#
# **Note:** There are some pitfalls in interpreting these numbers, which can be found
# in the `scikit-learn docs <https://scikit-learn.org/stable/modules/permutation_importance.html>`_.

r = permutation_importance(automl, X_test, y_test,
                           n_repeats=10,
                           random_state=0)

sort_idx = r.importances_mean.argsort()[::-1]
plt.boxplot(r.importances[sort_idx].T, labels=[dataset.feature_names[i] for i in sort_idx])
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

for i in sort_idx[::-1]:
    print(f"{dataset.feature_names[i]:10s}: {r.importances_mean[i]:.3f} +/- "
          f"{r.importances_std[i]:.3f}")

############################################################################################
# Create partial dependence (PD) and individual conditional expectation (ICE) plots - part 2
# ==========================================================================================
#
# `ICE plots <https://christophm.github.io/interpretable-ml-book/ice.html>`_ describe the relation
# between feature values and the response value for each sample
# individually -- it shows how the response value changes if the value of one feature is changed.
#
# `PD plots <https://christophm.github.io/interpretable-ml-book/pdp.html>`_ describe the relation
# between feature values and the response value averaged over all samples, i.e. the expected
# response value as a function of one or multiple input features. Since we use a classification
# dataset, the response corresponds to the predicted class probability.
#
# Since ``acceleration_y`` and ``acceleration_z`` turned out to have the largest impact on the
# response value according to the permutation importance, we'll first look at them and generate a
# plot combining ICE (thin lines) and PD (thick line).

features = [1, 2]
plot_partial_dependence(automl, dataset.data, features=features, grid_resolution=5,
                        kind="both", feature_names=dataset.feature_names)
plt.tight_layout()
plt.show()
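
# The averaged (PD) curve above can also be obtained numerically via
# ``sklearn.inspection.partial_dependence``. A minimal sketch, assuming a
# scikit-learn version >= 0.24 (which the ``kind="both"`` argument above
# already requires), where the result is a Bunch with ``values`` and
# ``average`` entries:
from sklearn.inspection import partial_dependence

pd_result = partial_dependence(automl, dataset.data, features=[1], grid_resolution=5)
print("Grid for acceleration_y:", pd_result["values"][0])
print("Average predicted probability:", pd_result["average"][0])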

##########################################################################
# Create partial dependence (PDP) plots for more than one feature - part 3
# ========================================================================
#
# A PD plot can also be generated for two features, which allows us to inspect the interaction
# between them. Again, we'll look at ``acceleration_y`` and ``acceleration_z``.

features = [[1, 2]]
plot_partial_dependence(automl, dataset.data, features=features, grid_resolution=5,
                        feature_names=dataset.feature_names)
plt.tight_layout()
plt.show()