5
5
6
6
import os
7
7
import imp
8
+ import shutil
8
9
9
10
import numpy as np
10
11
import pandas as pd
@@ -157,13 +158,36 @@ def assert_submission(ramp_kit_dir='.', ramp_data_dir='.',
157
158
def blend_submissions (submissions , ramp_kit_dir = '.' , ramp_data_dir = '.' ,
158
159
ramp_submission_dir = '.' , save_output = False ,
159
160
min_improvement = 0.0 ):
161
+ """Blend submissions in a ramp-kit and compute their contributivities.
162
+
163
+ If save_output is True, we create three files:
164
+ <ramp_submission_dir>/training_output/contributivities.csv
165
+ <ramp_submission_dir>/training_output/bagged_scores_combined.csv
166
+ <ramp_submission_dir>/training_output/bagged_scores_foldwise_best.csv
167
+
168
+ Parameters
169
+ ----------
170
+ submissions : list of str
171
+ List of submission names (folders in <ramp_submission_dir>).
172
+ ramp_kit_dir : str, default='.'
173
+ The directory of the ramp-kit to be blended.
174
+ ramp_data_dir : str, default='.'
175
+ The directory of the data.
176
+ ramp_submission_dir : str, default='.'
177
+ The directory of the submissions.
178
+ save_output : bool, default=False
179
+ Whether to store the blending results.
180
+ min_improvement : float, default=0.0
181
+ The minimum improvement below which the greedy blender is stopped.
182
+ """
160
183
problem = assert_read_problem (ramp_kit_dir )
161
184
print_title ('Blending {}' .format (problem .problem_title ))
162
185
X_train , y_train , X_test , y_test = assert_data (ramp_kit_dir , ramp_data_dir )
163
186
cv = assert_cv (ramp_kit_dir , ramp_data_dir )
164
187
valid_is_list = [valid_is for (train_is , valid_is ) in cv ]
165
188
score_types = assert_score_types (ramp_kit_dir )
166
- contributivitys = np .zeros (len (submissions ))
189
+ n_folds = len (valid_is_list )
190
+ contributivitys = np .zeros ((len (submissions ), n_folds ))
167
191
168
192
combined_predictions_valid_list = []
169
193
foldwise_best_predictions_valid_list = []
@@ -198,34 +222,55 @@ def blend_submissions(submissions, ramp_kit_dir='.', ramp_data_dir='.',
198
222
# we share a unit of 1. among the contributive submissions
199
223
unit_contributivity = 1. / len (best_index_list )
200
224
for i in best_index_list :
201
- contributivitys [i ] += unit_contributivity
225
+ contributivitys [i , fold_i ] += unit_contributivity
202
226
203
227
combined_predictions_valid_list .append (
204
- problem .Predictions .combine (predictions_valid_list ))
205
- foldwise_best_predictions_valid_list .append (predictions_valid_list [0 ])
228
+ problem .Predictions .combine (
229
+ predictions_valid_list , best_index_list ))
230
+ foldwise_best_predictions_valid_list .append (
231
+ predictions_valid_list [best_index_list [0 ]])
206
232
combined_predictions_test_list .append (
207
- problem .Predictions .combine (predictions_test_list ))
208
- foldwise_best_predictions_test_list .append (predictions_test_list [0 ])
233
+ problem .Predictions .combine (
234
+ predictions_test_list , best_index_list ))
235
+ foldwise_best_predictions_test_list .append (
236
+ predictions_test_list [best_index_list [0 ]])
209
237
210
- contributivitys /= len ( cv )
238
+ contributivitys /= n_folds
211
239
contributivitys_df = pd .DataFrame ()
212
240
contributivitys_df ['submission' ] = np .array (submissions )
213
- contributivitys_df ['contributivity' ] = np .round (contributivitys , 3 )
214
- contributivitys_df = contributivitys_df .reset_index ()
241
+ contributivitys_df ['contributivity' ] = np .zeros (len (submissions ))
242
+ for fold_i in range (n_folds ):
243
+ c_i = contributivitys [:, fold_i ]
244
+ contributivitys_df ['fold_{}' .format (fold_i )] = c_i
245
+ contributivitys_df ['contributivity' ] += c_i
246
+ percentage_factor = 100 / contributivitys_df ['contributivity' ].sum ()
247
+ contributivitys_df ['contributivity' ] *= percentage_factor
248
+ rounded = contributivitys_df ['contributivity' ].round ().astype (int )
249
+ contributivitys_df ['contributivity' ] = rounded
215
250
contributivitys_df = contributivitys_df .sort_values (
216
251
'contributivity' , ascending = False )
217
252
print (contributivitys_df .to_string (index = False ))
218
253
219
- training_output_path = os .path .join (ramp_kit_dir , 'training_output' )
220
- if not os .path .exists (training_output_path ):
221
- os .mkdir (training_output_path )
254
+ if save_output :
255
+ training_output_path = os .path .join (
256
+ ramp_submission_dir , 'training_output' )
257
+ if not os .path .exists (training_output_path ):
258
+ os .mkdir (training_output_path )
259
+ contributivitys_df .to_csv (os .path .join (
260
+ training_output_path , 'contributivities.csv' ), index = False )
261
+
222
262
# bagging the foldwise ensembles
223
263
bag_submissions (
224
264
problem , cv , y_train , y_test , combined_predictions_valid_list ,
225
265
combined_predictions_test_list , training_output_path ,
226
266
ramp_data_dir = ramp_data_dir , score_type_index = 0 ,
227
267
save_output = save_output , score_table_title = 'Combined bagged scores' ,
228
268
score_f_name_prefix = 'foldwise_best' )
269
+ if save_output :
270
+ shutil .move (
271
+ os .path .join (training_output_path , 'bagged_scores.csv' ),
272
+ os .path .join (training_output_path , 'bagged_scores_combined.csv' ))
273
+
229
274
# bagging the foldwise best submissions
230
275
bag_submissions (
231
276
problem , cv , y_train , y_test , foldwise_best_predictions_valid_list ,
@@ -234,3 +279,8 @@ def blend_submissions(submissions, ramp_kit_dir='.', ramp_data_dir='.',
234
279
save_output = save_output ,
235
280
score_table_title = 'Foldwise best bagged scores' ,
236
281
score_f_name_prefix = 'combined' )
282
+ if save_output :
283
+ shutil .move (
284
+ os .path .join (training_output_path , 'bagged_scores.csv' ),
285
+ os .path .join (
286
+ training_output_path , 'bagged_scores_foldwise_best.csv' ))
0 commit comments