Skip to content

Commit 0d837ef

Browse files
authored
Fixing blending (#194)
* [DOC] Add circleci and sphinx documentation (#185) * add sphinx doc * add to using ramp workflow * add docs * create initial circleCI * amend doc * [empty] trigger circleCI * dictate branch * remove branch specification * [mrg] fixing issues with tests not passing on appveyor (#186) * Update README.md * checking the test_notebook_testing path * debugging convert_notebook * checking where exactly in execute-notebook is the problem * changed titanic starting kit not to have !command * hashing out the Executing Shell Commands (!) in jupyter notebooks * cleanup * updated link for appveyor * fixing blending * fixing blending cli * fixing blending * make blending more readable * creating training_output before using it * flake * assert cv * flake * getting rid of python 2 in travis
1 parent f48b024 commit 0d837ef

File tree

6 files changed

+74
-25
lines changed

6 files changed

+74
-25
lines changed

.travis.yml

Lines changed: 0 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -5,7 +5,6 @@ branches:
55
only:
66
- master
77
env:
8-
- PYTHON_VERSION=2.7
98
- PYTHON_VERSION=3.5
109
- PYTHON_VERSION=3.6
1110
before_install:

rampwf/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -4,6 +4,7 @@
44
from . import score_types
55
from . import utils
66
from . import workflows
7+
from ._version import get_versions
78

89

910
__all__ = [
@@ -15,7 +16,5 @@
1516
'workflows',
1617
]
1718

18-
19-
from ._version import get_versions
2019
__version__ = get_versions()['version']
2120
del get_versions

rampwf/score_types/tests/test_detection.py

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -177,8 +177,8 @@ def test_average_precision():
177177

178178
conf, ps, rs = precision_recall_curve_greedy(y_true, y_pred)
179179
assert conf.tolist() == [0.9, 0.8, 0.7, 0.6, 0.5, 0.4]
180-
assert ps.tolist() == [1, 1, 2/3, 3/4, 3/5, 3/6] # noqa
181-
assert rs.tolist() == [1/4, 2/4, 2/4, 3/4, 3/4, 3/4] # noqa
180+
assert ps.tolist() == [1, 1, 2 / 3, 3 / 4, 3 / 5, 3 / 6] # noqa
181+
assert rs.tolist() == [1 / 4, 2 / 4, 2 / 4, 3 / 4, 3 / 4, 3 / 4] # noqa
182182
assert ap(y_true, y_pred) == 11 / 16 # 0.5 * 1 + 0.25 * 3/4 + 0.25 * 0
183183

184184

rampwf/utils/combine.py

Lines changed: 8 additions & 7 deletions
Original file line number · Diff line number · Diff line change
@@ -124,10 +124,11 @@ def _get_next_best_submission(predictions_list, ground_truths,
124124
Predictions, predictions_list, new_index_list)
125125
new_score = score_type.score_function(
126126
ground_truths, combined_predictions)
127-
iltb = score_type.is_lower_the_better
128-
nltb = new_score < best_score - min_improvement
129-
bltn = new_score > best_score + min_improvement
130-
if (iltb and nltb) or (not iltb and bltn):
127+
if score_type.is_lower_the_better:
128+
is_improved = new_score < best_score - min_improvement
129+
else:
130+
is_improved = new_score > best_score + min_improvement
131+
if is_improved:
131132
best_predictions = combined_predictions
132133
best_index = i
133134
best_score = new_score
@@ -167,14 +168,14 @@ def blend_on_fold(predictions_list, ground_truths_valid, score_type,
167168
best_prediction_index = np.argmax(valid_scores)
168169
score = valid_scores[best_prediction_index]
169170
best_index_list = np.array([best_prediction_index])
170-
improvement = True
171-
while improvement and len(best_index_list) < max_n_ensemble:
171+
is_improved = True
172+
while is_improved and len(best_index_list) < max_n_ensemble:
172173
print('\t{}: {}'.format(best_index_list, score))
173174
old_best_index_list = best_index_list
174175
best_index_list, score = _get_next_best_submission(
175176
predictions_list, ground_truths_valid, score_type, best_index_list,
176177
min_improvement)
177-
improvement = len(best_index_list) != len(old_best_index_list)
178+
is_improved = len(best_index_list) != len(old_best_index_list)
178179
return best_index_list
179180
# we share a unit of 1. among the contributive submissions
180181
# unit_contributivity = 1. / len(best_index_list)

rampwf/utils/command_line.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -157,7 +157,7 @@ def create_ramp_blend_submissions_parser():
157157
' Specify submissions separated by a comma without'
158158
' spaces. If "ALL", all submissions in the directory'
159159
' will be blended.')
160-
parser.add_argument('--save_output', dest='save_output',
160+
parser.add_argument('--save-output', dest='save_output',
161161
action='store_true',
162162
help='Specify this flag to save predictions '
163163
'after blending.')

rampwf/utils/testing.py

Lines changed: 62 additions & 12 deletions
Original file line number · Diff line number · Diff line change
@@ -5,6 +5,7 @@
55

66
import os
77
import imp
8+
import shutil
89

910
import numpy as np
1011
import pandas as pd
@@ -157,13 +158,36 @@ def assert_submission(ramp_kit_dir='.', ramp_data_dir='.',
157158
def blend_submissions(submissions, ramp_kit_dir='.', ramp_data_dir='.',
158159
ramp_submission_dir='.', save_output=False,
159160
min_improvement=0.0):
161+
"""Blending submissions in a ramp-kit and compute contributivities.
162+
163+
If save_output is True, we create three files:
164+
<ramp_submission_dir>/training_output/contributivities.csv
165+
<ramp_submission_dir>/training_output/bagged_scores_combined.csv
166+
<ramp_submission_dir>/training_output/bagged_scores_foldwise_best.csv
167+
168+
Parameters
169+
----------
170+
submissions : list of str
171+
List of submission names (folders in <ramp_submission_dir>).
172+
ramp_kit_dir : str, default='.'
173+
The directory of the ramp-kit to be blended.
174+
ramp_data_dir : str, default='.'
175+
The directory of the data.
176+
ramp_submission_dir : str, default='./submissions'
177+
The directory of the submissions.
178+
save_output : bool, default is False
179+
Whether to store the blending results.
180+
min_improvement : float, default is 0.0
181+
The minimum improvement under which greedy blender is stopped.
182+
"""
160183
problem = assert_read_problem(ramp_kit_dir)
161184
print_title('Blending {}'.format(problem.problem_title))
162185
X_train, y_train, X_test, y_test = assert_data(ramp_kit_dir, ramp_data_dir)
163186
cv = assert_cv(ramp_kit_dir, ramp_data_dir)
164187
valid_is_list = [valid_is for (train_is, valid_is) in cv]
165188
score_types = assert_score_types(ramp_kit_dir)
166-
contributivitys = np.zeros(len(submissions))
189+
n_folds = len(valid_is_list)
190+
contributivitys = np.zeros((len(submissions), n_folds))
167191

168192
combined_predictions_valid_list = []
169193
foldwise_best_predictions_valid_list = []
@@ -198,34 +222,55 @@ def blend_submissions(submissions, ramp_kit_dir='.', ramp_data_dir='.',
198222
# we share a unit of 1. among the contributive submissions
199223
unit_contributivity = 1. / len(best_index_list)
200224
for i in best_index_list:
201-
contributivitys[i] += unit_contributivity
225+
contributivitys[i, fold_i] += unit_contributivity
202226

203227
combined_predictions_valid_list.append(
204-
problem.Predictions.combine(predictions_valid_list))
205-
foldwise_best_predictions_valid_list.append(predictions_valid_list[0])
228+
problem.Predictions.combine(
229+
predictions_valid_list, best_index_list))
230+
foldwise_best_predictions_valid_list.append(
231+
predictions_valid_list[best_index_list[0]])
206232
combined_predictions_test_list.append(
207-
problem.Predictions.combine(predictions_test_list))
208-
foldwise_best_predictions_test_list.append(predictions_test_list[0])
233+
problem.Predictions.combine(
234+
predictions_test_list, best_index_list))
235+
foldwise_best_predictions_test_list.append(
236+
predictions_test_list[best_index_list[0]])
209237

210-
contributivitys /= len(cv)
238+
contributivitys /= n_folds
211239
contributivitys_df = pd.DataFrame()
212240
contributivitys_df['submission'] = np.array(submissions)
213-
contributivitys_df['contributivity'] = np.round(contributivitys, 3)
214-
contributivitys_df = contributivitys_df.reset_index()
241+
contributivitys_df['contributivity'] = np.zeros(len(submissions))
242+
for fold_i in range(n_folds):
243+
c_i = contributivitys[:, fold_i]
244+
contributivitys_df['fold_{}'.format(fold_i)] = c_i
245+
contributivitys_df['contributivity'] += c_i
246+
percentage_factor = 100 / contributivitys_df['contributivity'].sum()
247+
contributivitys_df['contributivity'] *= percentage_factor
248+
rounded = contributivitys_df['contributivity'].round().astype(int)
249+
contributivitys_df['contributivity'] = rounded
215250
contributivitys_df = contributivitys_df.sort_values(
216251
'contributivity', ascending=False)
217252
print(contributivitys_df.to_string(index=False))
218253

219-
training_output_path = os.path.join(ramp_kit_dir, 'training_output')
220-
if not os.path.exists(training_output_path):
221-
os.mkdir(training_output_path)
254+
if save_output:
255+
training_output_path = os.path.join(
256+
ramp_submission_dir, 'training_output')
257+
if not os.path.exists(training_output_path):
258+
os.mkdir(training_output_path)
259+
contributivitys_df.to_csv(os.path.join(
260+
training_output_path, 'contributivities.csv'), index=False)
261+
222262
# bagging the foldwise ensembles
223263
bag_submissions(
224264
problem, cv, y_train, y_test, combined_predictions_valid_list,
225265
combined_predictions_test_list, training_output_path,
226266
ramp_data_dir=ramp_data_dir, score_type_index=0,
227267
save_output=save_output, score_table_title='Combined bagged scores',
228268
score_f_name_prefix='foldwise_best')
269+
if save_output:
270+
shutil.move(
271+
os.path.join(training_output_path, 'bagged_scores.csv'),
272+
os.path.join(training_output_path, 'bagged_scores_combined.csv'))
273+
229274
# bagging the foldwise best submissions
230275
bag_submissions(
231276
problem, cv, y_train, y_test, foldwise_best_predictions_valid_list,
@@ -234,3 +279,8 @@ def blend_submissions(submissions, ramp_kit_dir='.', ramp_data_dir='.',
234279
save_output=save_output,
235280
score_table_title='Foldwise best bagged scores',
236281
score_f_name_prefix='combined')
282+
if save_output:
283+
shutil.move(
284+
os.path.join(training_output_path, 'bagged_scores.csv'),
285+
os.path.join(
286+
training_output_path, 'bagged_scores_foldwise_best.csv'))

0 commit comments

Comments (0)