Description
I want to load data that is larger than memory, i.e. larger than the combined memory of the entire cluster.
To be specific, I want to take advantage of the features described in Using XGBoost External Memory Version — xgboost 2.1.0-dev documentation and Experimental support for external memory — xgboost 2.1.0-dev documentation.
I found RayDataIter, but it seems to be used only in the RayDeviceQuantileDMatrix path, gated on whether a legacy XGBoost (< 1.5.0 I think, without DataIter) is detected.
xgboost_ray/xgboost_ray/matrix.py, lines 43 to 49 in 9081780:

```python
try:
    from xgboost.core import DataIter

    LEGACY_MATRIX = False
except ImportError:
    DataIter = object
    LEGACY_MATRIX = True
```
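For context, the pattern described in the linked external-memory docs is to subclass xgboost.DataIter, hand XGBoost one batch per next() call, and pass the iterator to the DMatrix constructor so XGBoost caches pages on disk instead of materializing everything in memory. A minimal sketch, assuming XGBoost >= 1.5 (the SketchIter class, the in-memory shards and the cache location are illustrative only):

```python
import os
import tempfile

import numpy as np
import xgboost


class SketchIter(xgboost.DataIter):
    """Feeds XGBoost one data shard per next() call."""

    def __init__(self, shards):
        # `shards` stands in for lazily loaded batches (e.g. parquet files).
        self._shards = shards
        self._it = 0
        # cache_prefix tells XGBoost where to spill constructed pages.
        super().__init__(cache_prefix=os.path.join(tempfile.gettempdir(), "xgb-cache"))

    def next(self, input_data) -> int:
        # Return 1 to continue iteration, 0 when exhausted.
        if self._it == len(self._shards):
            return 0
        X, y = self._shards[self._it]
        input_data(data=X, label=y)
        self._it += 1
        return 1

    def reset(self) -> None:
        self._it = 0


# Toy usage: two small in-memory shards stand in for out-of-core batches.
shards = [(np.random.rand(100, 4), np.random.randint(2, size=100)) for _ in range(2)]
dtrain = xgboost.DMatrix(SketchIter(shards))  # external-memory DMatrix
xgboost.train({"objective": "binary:logistic", "tree_method": "hist"}, dtrain, num_boost_round=5)
```

xgb.QuantileDMatrix accepts the same kind of iterator, which is essentially what the RayDataIter / DeviceQuantileDMatrix path below already does; the request here is to get the same treatment for the plain DMatrix path.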
xgboost_ray/xgboost_ray/main.py, lines 365 to 431 in 9081780:

```python
def _get_dmatrix(data: RayDMatrix, param: Dict) -> xgb.DMatrix:
    if QUANTILE_AVAILABLE and isinstance(data, RayQuantileDMatrix):
        if isinstance(param["data"], list):
            qdm_param = _prepare_dmatrix_params(param)
            param.update(qdm_param)

        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical

        matrix = xgb.QuantileDMatrix(**param)

    if not LEGACY_MATRIX and isinstance(data, RayDeviceQuantileDMatrix):
        # If we only got a single data shard, create a list so we can
        # iterate over it
        if not isinstance(param["data"], list):
            param["data"] = [param["data"]]
        if not isinstance(param["label"], list):
            param["label"] = [param["label"]]
        if not isinstance(param["weight"], list):
            param["weight"] = [param["weight"]]
        if not isinstance(param["feature_weights"], list):
            param["feature_weights"] = [param["feature_weights"]]
        if not isinstance(param["qid"], list):
            param["qid"] = [param["qid"]]
        if not isinstance(param["data"], list):
            param["base_margin"] = [param["base_margin"]]
            param["label_lower_bound"] = [None]
            param["label_upper_bound"] = [None]

        dm_param = {
            "feature_names": data.feature_names,
            "feature_types": data.feature_types,
            "missing": data.missing,
        }
        if data.enable_categorical is not None:
            dm_param["enable_categorical"] = data.enable_categorical
        param.update(dm_param)

        it = RayDataIter(**param)
        matrix = xgb.DeviceQuantileDMatrix(it, **dm_param)
    else:
        if isinstance(param["data"], list):
            dm_param = _prepare_dmatrix_params(param)
            param.update(dm_param)

        ll = param.pop("label_lower_bound", None)
        lu = param.pop("label_upper_bound", None)
        fw = param.pop("feature_weights", None)

        if LEGACY_MATRIX:
            param.pop("base_margin", None)

        if "qid" not in inspect.signature(xgb.DMatrix).parameters:
            param.pop("qid", None)

        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical

        matrix = xgb.DMatrix(**param)

        if not LEGACY_MATRIX:
            matrix.set_info(
                label_lower_bound=ll, label_upper_bound=lu, feature_weights=fw
            )

    data.update_matrix_properties(matrix)
    return matrix
```
Maybe it would be better to construct the XGBoost DMatrix here with a custom DataIter, instead of concatenating all the data at once (a rough sketch of what that could look like is at the end of this issue):
xgboost_ray/xgboost_ray/main.py, line 423 in 9081780:

```python
matrix = xgb.DMatrix(**param)
```
xgboost_ray/xgboost_ray/main.py, lines 351 to 362 in 9081780:

```python
def _prepare_dmatrix_params(param: Dict) -> Dict:
    dm_param = {
        "data": concat_dataframes(param["data"]),
        "label": concat_dataframes(param["label"]),
        "weight": concat_dataframes(param["weight"]),
        "feature_weights": concat_dataframes(param["feature_weights"]),
        "qid": concat_dataframes(param["qid"]),
        "base_margin": concat_dataframes(param["base_margin"]),
        "label_lower_bound": concat_dataframes(param["label_lower_bound"]),
        "label_upper_bound": concat_dataframes(param["label_upper_bound"]),
    }
    return dm_param
```
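To make the proposal a bit more concrete, here is a rough sketch of what a non-concatenating path could look like. _ShardedDataIter and _get_dmatrix_external_memory are hypothetical names, not existing xgboost_ray or XGBoost APIs, and a real implementation would also have to cover base_margin, qid, feature_weights, the label bounds, enable_categorical, the feature metadata and the legacy code path:

```python
import os
import tempfile
from typing import Dict, List, Optional

import xgboost as xgb


class _ShardedDataIter(xgb.DataIter):
    """Hypothetical iterator that feeds the per-actor shards to XGBoost
    one at a time instead of concatenating them into a single frame."""

    def __init__(self, data: List, label: List, weight: Optional[List] = None):
        self._data = data
        self._label = label
        self._weight = weight
        self._it = 0
        # cache_prefix makes XGBoost spill constructed pages to disk.
        super().__init__(
            cache_prefix=os.path.join(tempfile.gettempdir(), "xgboost_ray_cache")
        )

    def next(self, input_data) -> int:
        if self._it == len(self._data):
            return 0  # end of iteration
        kwargs = {"data": self._data[self._it], "label": self._label[self._it]}
        if self._weight is not None and self._weight[self._it] is not None:
            kwargs["weight"] = self._weight[self._it]
        input_data(**kwargs)
        self._it += 1
        return 1

    def reset(self) -> None:
        self._it = 0


def _get_dmatrix_external_memory(param: Dict) -> xgb.DMatrix:
    # Hypothetical alternative to `_prepare_dmatrix_params` + `xgb.DMatrix(**param)`:
    # keep the shards as a list and let XGBoost page through them.
    data = param["data"] if isinstance(param["data"], list) else [param["data"]]
    label = param["label"] if isinstance(param["label"], list) else [param["label"]]
    weight = param.get("weight")
    if weight is not None and not isinstance(weight, list):
        weight = [weight]
    it = _ShardedDataIter(data, label, weight)
    # feature_names / feature_types / missing would be forwarded here as in _get_dmatrix.
    return xgb.DMatrix(it)
```

With something along these lines, peak memory per actor would be roughly one shard plus XGBoost's page cache, rather than the fully concatenated dataset.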