Compare commits
1 Commits
create-pul
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
650f8c17e0 |
41
.github/workflows/python_tests.yml
vendored
41
.github/workflows/python_tests.yml
vendored
@@ -255,44 +255,3 @@ jobs:
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark
|
||||
|
||||
python-system-installation-on-ubuntu:
|
||||
name: Test XGBoost Python package System Installation on ${{ matrix.os }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.8
|
||||
|
||||
- name: Install ninja
|
||||
run: |
|
||||
sudo apt-get update && sudo apt-get install -y ninja-build
|
||||
|
||||
- name: Build XGBoost on Ubuntu
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -GNinja
|
||||
ninja
|
||||
|
||||
- name: Copy lib to system lib
|
||||
run: |
|
||||
cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib"
|
||||
|
||||
- name: Install XGBoost in Virtual Environment
|
||||
run: |
|
||||
cd python-package
|
||||
pip install virtualenv
|
||||
virtualenv venv
|
||||
source venv/bin/activate && \
|
||||
pip install -v . --config-settings use_system_libxgboost=True && \
|
||||
python -c 'import xgboost'
|
||||
|
||||
17
NEWS.md
17
NEWS.md
@@ -3,23 +3,6 @@ XGBoost Change Log
|
||||
|
||||
This file records the changes in xgboost library in reverse chronological order.
|
||||
|
||||
## 1.7.6 (2023 Jun 16)
|
||||
|
||||
This is a patch release for bug fixes. The CRAN package for the R binding is kept at 1.7.5.
|
||||
|
||||
### Bug Fixes
|
||||
* Fix distributed training with mixed dense and sparse partitions. (#9272)
|
||||
* Fix monotone constraints on CPU with large trees. (#9122)
|
||||
* [spark] Make the spark model have the same UID as its estimator (#9022)
|
||||
* Optimize prediction with `QuantileDMatrix`. (#9096)
|
||||
|
||||
### Document
|
||||
* Improve doxygen (#8959)
|
||||
* Update the cuDF pip index URL. (#9106)
|
||||
|
||||
### Maintenance
|
||||
* Fix tests with pandas 2.0. (#9014)
|
||||
|
||||
## 1.7.5 (2023 Mar 30)
|
||||
This is a patch release for bug fixes.
|
||||
|
||||
|
||||
@@ -85,18 +85,9 @@ test_that("dart prediction works", {
|
||||
rnorm(100)
|
||||
|
||||
set.seed(1994)
|
||||
booster_by_xgboost <- xgboost(
|
||||
data = d,
|
||||
label = y,
|
||||
max_depth = 2,
|
||||
booster = "dart",
|
||||
rate_drop = 0.5,
|
||||
one_drop = TRUE,
|
||||
eta = 1,
|
||||
nthread = 2,
|
||||
nrounds = nrounds,
|
||||
objective = "reg:squarederror"
|
||||
)
|
||||
booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
|
||||
rate_drop = 0.5, one_drop = TRUE,
|
||||
eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
|
||||
pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
|
||||
pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
|
||||
expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
|
||||
@@ -106,19 +97,19 @@ test_that("dart prediction works", {
|
||||
|
||||
set.seed(1994)
|
||||
dtrain <- xgb.DMatrix(data = d, info = list(label = y))
|
||||
booster_by_train <- xgb.train(
|
||||
params = list(
|
||||
booster = "dart",
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
rate_drop = 0.5,
|
||||
one_drop = TRUE,
|
||||
nthread = 1,
|
||||
objective = "reg:squarederror"
|
||||
),
|
||||
data = dtrain,
|
||||
nrounds = nrounds
|
||||
)
|
||||
booster_by_train <- xgb.train(params = list(
|
||||
booster = "dart",
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
rate_drop = 0.5,
|
||||
one_drop = TRUE,
|
||||
nthread = 1,
|
||||
tree_method = "exact",
|
||||
objective = "reg:squarederror"
|
||||
),
|
||||
data = dtrain,
|
||||
nrounds = nrounds
|
||||
)
|
||||
pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
|
||||
pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
|
||||
pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
|
||||
@@ -408,7 +399,7 @@ test_that("colsample_bytree works", {
|
||||
xgb.importance(model = bst)
|
||||
# If colsample_bytree works properly, a variety of features should be used
|
||||
# in the 100 trees
|
||||
expect_gte(nrow(xgb.importance(model = bst)), 28)
|
||||
expect_gte(nrow(xgb.importance(model = bst)), 30)
|
||||
})
|
||||
|
||||
test_that("Configuration works", {
|
||||
|
||||
@@ -76,20 +76,32 @@ test_that("Models from previous versions of XGBoost can be loaded", {
|
||||
name <- m[3]
|
||||
is_rds <- endsWith(model_file, '.rds')
|
||||
is_json <- endsWith(model_file, '.json')
|
||||
# Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
|
||||
if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
|
||||
booster <- readRDS(model_file)
|
||||
expect_warning(predict(booster, newdata = pred_data))
|
||||
booster <- readRDS(model_file)
|
||||
expect_warning(run_booster_check(booster, name))
|
||||
} else {
|
||||
if (is_rds) {
|
||||
|
||||
cpp_warning <- capture.output({
|
||||
# Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
|
||||
if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
|
||||
booster <- readRDS(model_file)
|
||||
expect_warning(predict(booster, newdata = pred_data))
|
||||
booster <- readRDS(model_file)
|
||||
expect_warning(run_booster_check(booster, name))
|
||||
} else {
|
||||
booster <- xgb.load(model_file)
|
||||
if (is_rds) {
|
||||
booster <- readRDS(model_file)
|
||||
} else {
|
||||
booster <- xgb.load(model_file)
|
||||
}
|
||||
predict(booster, newdata = pred_data)
|
||||
run_booster_check(booster, name)
|
||||
}
|
||||
predict(booster, newdata = pred_data)
|
||||
run_booster_check(booster, name)
|
||||
})
|
||||
cpp_warning <- paste0(cpp_warning, collapse = ' ')
|
||||
if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') >= 0) {
|
||||
# Expect a C++ warning when a model is loaded from RDS and it was generated by old XGBoost`
|
||||
m <- grepl(paste0('.*If you are loading a serialized model ',
|
||||
'\\(like pickle in Python, RDS in R\\).*',
|
||||
'for more details about differences between ',
|
||||
'saving model and serializing.*'), cpp_warning, perl = TRUE)
|
||||
expect_true(length(m) > 0 && all(m))
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
@@ -13,10 +13,7 @@ test_that("updating the model works", {
|
||||
watchlist <- list(train = dtrain, test = dtest)
|
||||
|
||||
# no-subsampling
|
||||
p1 <- list(
|
||||
objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2,
|
||||
updater = "grow_colmaker,prune"
|
||||
)
|
||||
p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
|
||||
set.seed(11)
|
||||
bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
|
||||
tr1 <- xgb.model.dt.tree(model = bst1)
|
||||
|
||||
@@ -18,11 +18,13 @@
|
||||
publisher={Institute of Mathematical Statistics}
|
||||
}
|
||||
|
||||
|
||||
@misc{
|
||||
Bache+Lichman:2013 ,
|
||||
author = "K. Bache and M. Lichman",
|
||||
year = "2013",
|
||||
title = "{UCI} Machine Learning Repository",
|
||||
url = "https://archive.ics.uci.edu/",
|
||||
institution = "University of California, Irvine, School of Information and Computer Sciences"
|
||||
url = "http://archive.ics.uci.edu/ml/",
|
||||
institution = "University of California, Irvine, School of Information and Computer Sciences"
|
||||
}
|
||||
|
||||
|
||||
@@ -11,43 +11,33 @@ import numpy as np
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
plt.rcParams.update({"font.size": 13})
|
||||
|
||||
plt.rcParams.update({'font.size': 13})
|
||||
|
||||
# Function to visualize censored labels
|
||||
def plot_censored_labels(
|
||||
X: np.ndarray, y_lower: np.ndarray, y_upper: np.ndarray
|
||||
) -> None:
|
||||
def replace_inf(x: np.ndarray, target_value: float) -> np.ndarray:
|
||||
def plot_censored_labels(X, y_lower, y_upper):
|
||||
def replace_inf(x, target_value):
|
||||
x[np.isinf(x)] = target_value
|
||||
return x
|
||||
|
||||
plt.plot(X, y_lower, "o", label="y_lower", color="blue")
|
||||
plt.plot(X, y_upper, "o", label="y_upper", color="fuchsia")
|
||||
plt.vlines(
|
||||
X,
|
||||
ymin=replace_inf(y_lower, 0.01),
|
||||
ymax=replace_inf(y_upper, 1000.0),
|
||||
label="Range for y",
|
||||
color="gray",
|
||||
)
|
||||
|
||||
plt.plot(X, y_lower, 'o', label='y_lower', color='blue')
|
||||
plt.plot(X, y_upper, 'o', label='y_upper', color='fuchsia')
|
||||
plt.vlines(X, ymin=replace_inf(y_lower, 0.01), ymax=replace_inf(y_upper, 1000),
|
||||
label='Range for y', color='gray')
|
||||
|
||||
# Toy data
|
||||
X = np.array([1, 2, 3, 4, 5]).reshape((-1, 1))
|
||||
INF = np.inf
|
||||
y_lower = np.array([10, 15, -INF, 30, 100])
|
||||
y_upper = np.array([INF, INF, 20, 50, INF])
|
||||
y_lower = np.array([ 10, 15, -INF, 30, 100])
|
||||
y_upper = np.array([INF, INF, 20, 50, INF])
|
||||
|
||||
# Visualize toy data
|
||||
plt.figure(figsize=(5, 4))
|
||||
plot_censored_labels(X, y_lower, y_upper)
|
||||
plt.ylim((6, 200))
|
||||
plt.legend(loc="lower right")
|
||||
plt.title("Toy data")
|
||||
plt.xlabel("Input feature")
|
||||
plt.ylabel("Label")
|
||||
plt.yscale("log")
|
||||
plt.legend(loc='lower right')
|
||||
plt.title('Toy data')
|
||||
plt.xlabel('Input feature')
|
||||
plt.ylabel('Label')
|
||||
plt.yscale('log')
|
||||
plt.tight_layout()
|
||||
plt.show(block=True)
|
||||
|
||||
@@ -56,83 +46,54 @@ grid_pts = np.linspace(0.8, 5.2, 1000).reshape((-1, 1))
|
||||
|
||||
# Train AFT model using XGBoost
|
||||
dmat = xgb.DMatrix(X)
|
||||
dmat.set_float_info("label_lower_bound", y_lower)
|
||||
dmat.set_float_info("label_upper_bound", y_upper)
|
||||
params = {"max_depth": 3, "objective": "survival:aft", "min_child_weight": 0}
|
||||
dmat.set_float_info('label_lower_bound', y_lower)
|
||||
dmat.set_float_info('label_upper_bound', y_upper)
|
||||
params = {'max_depth': 3, 'objective':'survival:aft', 'min_child_weight': 0}
|
||||
|
||||
accuracy_history = []
|
||||
def plot_intermediate_model_callback(env):
|
||||
"""Custom callback to plot intermediate models"""
|
||||
# Compute y_pred = prediction using the intermediate model, at current boosting iteration
|
||||
y_pred = env.model.predict(dmat)
|
||||
# "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
|
||||
# the corresponding predicted label (y_pred)
|
||||
acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)/len(X) * 100)
|
||||
accuracy_history.append(acc)
|
||||
|
||||
# Plot ranged labels as well as predictions by the model
|
||||
plt.subplot(5, 3, env.iteration + 1)
|
||||
plot_censored_labels(X, y_lower, y_upper)
|
||||
y_pred_grid_pts = env.model.predict(xgb.DMatrix(grid_pts))
|
||||
plt.plot(grid_pts, y_pred_grid_pts, 'r-', label='XGBoost AFT model', linewidth=4)
|
||||
plt.title('Iteration {}'.format(env.iteration), x=0.5, y=0.8)
|
||||
plt.xlim((0.8, 5.2))
|
||||
plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
|
||||
plt.yscale('log')
|
||||
|
||||
class PlotIntermediateModel(xgb.callback.TrainingCallback):
|
||||
"""Custom callback to plot intermediate models."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def after_iteration(
|
||||
self,
|
||||
model: xgb.Booster,
|
||||
epoch: int,
|
||||
evals_log: xgb.callback.TrainingCallback.EvalsLog,
|
||||
) -> bool:
|
||||
"""Run after training is finished."""
|
||||
# Compute y_pred = prediction using the intermediate model, at current boosting
|
||||
# iteration
|
||||
y_pred = model.predict(dmat)
|
||||
# "Accuracy" = the number of data points whose ranged label (y_lower, y_upper)
|
||||
# includes the corresponding predicted label (y_pred)
|
||||
acc = np.sum(
|
||||
np.logical_and(y_pred >= y_lower, y_pred <= y_upper) / len(X) * 100
|
||||
)
|
||||
accuracy_history.append(acc)
|
||||
|
||||
# Plot ranged labels as well as predictions by the model
|
||||
plt.subplot(5, 3, epoch + 1)
|
||||
plot_censored_labels(X, y_lower, y_upper)
|
||||
y_pred_grid_pts = model.predict(xgb.DMatrix(grid_pts))
|
||||
plt.plot(
|
||||
grid_pts, y_pred_grid_pts, "r-", label="XGBoost AFT model", linewidth=4
|
||||
)
|
||||
plt.title("Iteration {}".format(epoch), x=0.5, y=0.8)
|
||||
plt.xlim((0.8, 5.2))
|
||||
plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
|
||||
plt.yscale("log")
|
||||
return False
|
||||
|
||||
|
||||
res: xgb.callback.TrainingCallback.EvalsLog = {}
|
||||
plt.figure(figsize=(12, 13))
|
||||
bst = xgb.train(
|
||||
params,
|
||||
dmat,
|
||||
15,
|
||||
[(dmat, "train")],
|
||||
evals_result=res,
|
||||
callbacks=[PlotIntermediateModel()],
|
||||
)
|
||||
res = {}
|
||||
plt.figure(figsize=(12,13))
|
||||
bst = xgb.train(params, dmat, 15, [(dmat, 'train')], evals_result=res,
|
||||
callbacks=[plot_intermediate_model_callback])
|
||||
plt.tight_layout()
|
||||
plt.legend(
|
||||
loc="lower center",
|
||||
ncol=4,
|
||||
bbox_to_anchor=(0.5, 0),
|
||||
bbox_transform=plt.gcf().transFigure,
|
||||
)
|
||||
plt.legend(loc='lower center', ncol=4,
|
||||
bbox_to_anchor=(0.5, 0),
|
||||
bbox_transform=plt.gcf().transFigure)
|
||||
plt.tight_layout()
|
||||
|
||||
# Plot negative log likelihood over boosting iterations
|
||||
plt.figure(figsize=(8, 3))
|
||||
plt.figure(figsize=(8,3))
|
||||
plt.subplot(1, 2, 1)
|
||||
plt.plot(res["train"]["aft-nloglik"], "b-o", label="aft-nloglik")
|
||||
plt.xlabel("# Boosting Iterations")
|
||||
plt.legend(loc="best")
|
||||
plt.plot(res['train']['aft-nloglik'], 'b-o', label='aft-nloglik')
|
||||
plt.xlabel('# Boosting Iterations')
|
||||
plt.legend(loc='best')
|
||||
|
||||
# Plot "accuracy" over boosting iterations
|
||||
# "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
|
||||
# the corresponding predicted label (y_pred)
|
||||
plt.subplot(1, 2, 2)
|
||||
plt.plot(accuracy_history, "r-o", label="Accuracy (%)")
|
||||
plt.xlabel("# Boosting Iterations")
|
||||
plt.legend(loc="best")
|
||||
plt.plot(accuracy_history, 'r-o', label='Accuracy (%)')
|
||||
plt.xlabel('# Boosting Iterations')
|
||||
plt.legend(loc='best')
|
||||
plt.tight_layout()
|
||||
|
||||
plt.show()
|
||||
|
||||
@@ -82,10 +82,10 @@ def main(tmpdir: str) -> xgboost.Booster:
|
||||
missing = np.NaN
|
||||
Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False)
|
||||
|
||||
# Other tree methods including ``approx``, and ``gpu_hist`` are supported. GPU
|
||||
# behaves differently than CPU tree methods. See tutorial in doc for details.
|
||||
# Other tree methods including ``hist`` and ``gpu_hist`` also work, see tutorial in
|
||||
# doc for details.
|
||||
booster = xgboost.train(
|
||||
{"tree_method": "hist", "max_depth": 4},
|
||||
{"tree_method": "approx", "max_depth": 2},
|
||||
Xy,
|
||||
evals=[(Xy, "Train")],
|
||||
num_boost_round=10,
|
||||
|
||||
@@ -259,7 +259,7 @@ There are several ways to build and install the package from source:
|
||||
|
||||
import sys
|
||||
import pathlib
|
||||
libpath = pathlib.Path(sys.base_prefix).joinpath("lib", "libxgboost.so")
|
||||
libpath = pathlib.Path(sys.prefix).joinpath("lib", "libxgboost.so")
|
||||
assert libpath.exists()
|
||||
|
||||
Then pass ``use_system_libxgboost=True`` option to ``pip install``:
|
||||
|
||||
@@ -33,8 +33,6 @@ DMatrix
|
||||
.. doxygengroup:: DMatrix
|
||||
:project: xgboost
|
||||
|
||||
.. _c_streaming:
|
||||
|
||||
Streaming
|
||||
---------
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
model.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
|
||||
model.set_param({"predictor": "gpu_predictor"})
|
||||
shap_values = model.predict(dtrain, pred_contribs=True)
|
||||
shap_interaction_values = model.predict(dtrain, pred_interactions=True)
|
||||
|
||||
|
||||
@@ -199,6 +199,18 @@ Parameters for Tree Booster
|
||||
- Maximum number of discrete bins to bucket continuous features.
|
||||
- Increasing this number improves the optimality of splits at the cost of higher computation time.
|
||||
|
||||
* ``predictor``, [default= ``auto``]
|
||||
|
||||
- The type of predictor algorithm to use. Provides the same results but allows the use of GPU or CPU.
|
||||
|
||||
- ``auto``: Configure predictor based on heuristics.
|
||||
- ``cpu_predictor``: Multicore CPU prediction algorithm.
|
||||
- ``gpu_predictor``: Prediction using GPU. Used when ``tree_method`` is ``gpu_hist``.
|
||||
When ``predictor`` is set to default value ``auto``, the ``gpu_hist`` tree method is
|
||||
able to provide GPU based prediction without copying training data to GPU memory.
|
||||
If ``gpu_predictor`` is explicitly specified, then all data is copied into GPU, only
|
||||
recommended for performing prediction tasks.
|
||||
|
||||
* ``num_parallel_tree``, [default=1]
|
||||
|
||||
- Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.
|
||||
|
||||
@@ -87,6 +87,15 @@ with the native Python interface :py:meth:`xgboost.Booster.predict` and
|
||||
behavior. Also the ``save_best`` parameter from :py:obj:`xgboost.callback.EarlyStopping`
|
||||
might be useful.
|
||||
|
||||
*********
|
||||
Predictor
|
||||
*********
|
||||
|
||||
There are 2 predictors in XGBoost (3 if you have the one-api plugin enabled), namely
|
||||
``cpu_predictor`` and ``gpu_predictor``. The default option is ``auto`` so that XGBoost
|
||||
can employ some heuristics for saving GPU memory during training. They might have slightly
|
||||
different outputs due to floating point errors.
|
||||
|
||||
|
||||
***********
|
||||
Base Margin
|
||||
@@ -125,6 +134,15 @@ it. Be aware that the output of in-place prediction depends on input data type,
|
||||
input is on GPU data output is :py:obj:`cupy.ndarray`, otherwise a :py:obj:`numpy.ndarray`
|
||||
is returned.
|
||||
|
||||
****************
|
||||
Categorical Data
|
||||
****************
|
||||
|
||||
Other than users performing encoding, XGBoost has experimental support for categorical
|
||||
data using ``gpu_hist`` and ``gpu_predictor``. No special operation needs to be done on
|
||||
input test data since the information about categories is encoded into the model during
|
||||
training.
|
||||
|
||||
*************
|
||||
Thread Safety
|
||||
*************
|
||||
@@ -141,6 +159,7 @@ instance we might accidentally call ``clf.set_params()`` inside a predict functi
|
||||
|
||||
def predict_fn(clf: xgb.XGBClassifier, X):
|
||||
X = preprocess(X)
|
||||
clf.set_params(predictor="gpu_predictor") # NOT safe!
|
||||
clf.set_params(n_jobs=1) # NOT safe!
|
||||
return clf.predict_proba(X, iteration_range=(0, 10))
|
||||
|
||||
|
||||
@@ -54,9 +54,6 @@ on a dask cluster:
|
||||
y = da.random.random(size=(num_obs, 1), chunks=(1000, 1))
|
||||
|
||||
dtrain = xgb.dask.DaskDMatrix(client, X, y)
|
||||
# or
|
||||
# dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y)
|
||||
# `DaskQuantileDMatrix` is available for the `hist` and `gpu_hist` tree method.
|
||||
|
||||
output = xgb.dask.train(
|
||||
client,
|
||||
@@ -148,8 +145,8 @@ Also for inplace prediction:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# where X is a dask DataFrame or dask Array backed by cupy or cuDF.
|
||||
booster.set_param({"gpu_id": "0"})
|
||||
booster.set_param({'predictor': 'gpu_predictor'})
|
||||
# where X is a dask DataFrame or dask Array containing cupy or cuDF backed data.
|
||||
prediction = xgb.dask.inplace_predict(client, booster, X)
|
||||
|
||||
When input is ``da.Array`` object, output is always ``da.Array``. However, if the input
|
||||
|
||||
@@ -2,34 +2,11 @@
|
||||
Using XGBoost External Memory Version
|
||||
#####################################
|
||||
|
||||
When working with large datasets, training XGBoost models can be challenging as the entire
|
||||
dataset needs to be loaded into memory. This can be costly and sometimes
|
||||
infeasible. Starting from 1.5, users can define a custom iterator to load data in chunks
|
||||
for running XGBoost algorithms. External memory can be used for both training and
|
||||
prediction, but training is the primary use case and it will be our focus in this
|
||||
tutorial. For prediction and evaluation, users can iterate through the data themselves
|
||||
while training requires the full dataset to be loaded into the memory.
|
||||
|
||||
During training, there are two different modes for external memory support available in
|
||||
XGBoost, one for CPU-based algorithms like ``hist`` and ``approx``, another one for the
|
||||
GPU-based training algorithm. We will introduce them in the following sections.
|
||||
|
||||
.. note::
|
||||
|
||||
Training on data from external memory is not supported by the ``exact`` tree method.
|
||||
|
||||
.. note::
|
||||
|
||||
The feature is still experimental as of 2.0. The performance is not well optimized.
|
||||
|
||||
The external memory support has gone through multiple iterations and is still under heavy
|
||||
development. Like the :py:class:`~xgboost.QuantileDMatrix` with
|
||||
:py:class:`~xgboost.DataIter`, XGBoost loads data batch-by-batch using a custom iterator
|
||||
supplied by the user. However, unlike the :py:class:`~xgboost.QuantileDMatrix`, external
|
||||
memory will not concatenate the batches unless GPU is used (it uses a hybrid approach,
|
||||
more details follow). Instead, it will cache all batches on the external memory and fetch
|
||||
them on-demand. Go to the end of the document to see a comparison between
|
||||
`QuantileDMatrix` and external memory.
|
||||
XGBoost supports loading data from external memory using builtin data parser. And
|
||||
starting from version 1.5, users can also define a custom iterator to load data in chunks.
|
||||
The feature is still experimental and not yet ready for production use. In this tutorial
|
||||
we will introduce both methods. Please note that training on data from external memory is
|
||||
not supported by ``exact`` tree method.
|
||||
|
||||
*************
|
||||
Data Iterator
|
||||
@@ -38,8 +15,8 @@ Data Iterator
|
||||
Starting from XGBoost 1.5, users can define their own data loader using Python or C
|
||||
interface. There are some examples in the ``demo`` directory for quick start. This is a
|
||||
generalized version of text input external memory, where users no longer need to prepare a
|
||||
text file that XGBoost recognizes. To enable the feature, users need to define a data
|
||||
iterator with 2 class methods: ``next`` and ``reset``, then pass it into the ``DMatrix``
|
||||
text file that XGBoost recognizes. To enable the feature, user need to define a data
|
||||
iterator with 2 class methods ``next`` and ``reset`` then pass it into ``DMatrix``
|
||||
constructor.
|
||||
|
||||
.. code-block:: python
|
||||
@@ -83,146 +60,20 @@ constructor.
|
||||
|
||||
# Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some caveats
|
||||
# as noted in following sections.
|
||||
booster = xgboost.train({"tree_method": "hist"}, Xy)
|
||||
booster = xgboost.train({"tree_method": "approx"}, Xy)
|
||||
|
||||
|
||||
The above snippet is a simplified version of :ref:`sphx_glr_python_examples_external_memory.py`.
|
||||
For an example in C, please see ``demo/c-api/external-memory/``. The iterator is the
|
||||
common interface for using external memory with XGBoost, you can pass the resulting
|
||||
``DMatrix`` object for training, prediction, and evaluation.
|
||||
|
||||
It is important to set the batch size based on the memory available. A good starting point
|
||||
is to set the batch size to 10GB per batch if you have 64GB of memory. It is *not*
|
||||
recommended to set small batch sizes like 32 samples per batch, as this can seriously hurt
|
||||
performance in gradient boosting.
|
||||
|
||||
***********
|
||||
CPU Version
|
||||
***********
|
||||
|
||||
In the previous section, we demonstrated how to train a tree-based model using the
|
||||
``hist`` tree method on a CPU. This method involves iterating through data batches stored
|
||||
in a cache during tree construction. For optimal performance, we recommend using the
|
||||
``grow_policy=depthwise`` setting, which allows XGBoost to build an entire layer of tree
|
||||
nodes with only a few batch iterations. Conversely, using the ``lossguide`` policy
|
||||
requires XGBoost to iterate over the data set for each tree node, resulting in slower
|
||||
performance.
|
||||
|
||||
If external memory is used, the performance of CPU training is limited by IO
|
||||
(input/output) speed. This means that the disk IO speed primarily determines the training
|
||||
speed. During benchmarking, we used an NVMe connected to a PCIe-4 slot, other types of
|
||||
storage can be too slow for practical usage. In addition, your system may perform caching
|
||||
to reduce the overhead of file reading.
|
||||
|
||||
**********************************
|
||||
GPU Version (GPU Hist tree method)
|
||||
**********************************
|
||||
|
||||
External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set to
|
||||
``gpu_hist``). However, the algorithm used for GPU is different from the one used for
|
||||
CPU. When training on a CPU, the tree method iterates through all batches from external
|
||||
memory for each step of the tree construction algorithm. On the other hand, the GPU
|
||||
algorithm uses a hybrid approach. It iterates through the data during the beginning of
|
||||
each iteration and concatenates all batches into one in GPU memory. To reduce overall
|
||||
memory usage, users can utilize subsampling. The GPU hist tree method supports
|
||||
`gradient-based sampling`, enabling users to set a low sampling rate without compromising
|
||||
accuracy.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
param = {
|
||||
...
|
||||
'subsample': 0.2,
|
||||
'sampling_method': 'gradient_based',
|
||||
}
|
||||
|
||||
For more information about the sampling algorithm and its use in external memory training,
|
||||
see `this paper <https://arxiv.org/abs/2005.09148>`_.
|
||||
|
||||
.. warning::
|
||||
|
||||
When GPU is running out of memory during iteration on external memory, user might
|
||||
receive a segfault instead of an OOM exception.
|
||||
|
||||
.. _ext_remarks:
|
||||
|
||||
*******
|
||||
Remarks
|
||||
*******
|
||||
|
||||
When using external memory with XGBoost, data is divided into smaller chunks so that only
|
||||
a fraction of it needs to be stored in memory at any given time. It's important to note
|
||||
that this method only applies to the predictor data (``X``), while other data, like labels
|
||||
and internal runtime structures are concatenated. This means that memory reduction is most
|
||||
effective when dealing with wide datasets where ``X`` is significantly larger in size
|
||||
compared to other data like ``y``, while it has little impact on slim datasets.
|
||||
|
||||
As one might expect, fetching data on-demand puts significant pressure on the storage
|
||||
device. Today's computing device can process way more data than a storage can read in a
|
||||
single unit of time. The ratio is at orders of magnitude. A GPU is capable of processing
|
||||
hundreds of gigabytes of floating-point data in a split second. On the other hand, a
|
||||
four-lane NVMe storage connected to a PCIe-4 slot usually has about 6GB/s of data transfer
|
||||
rate. As a result, the training is likely to be severely bounded by your storage
|
||||
device. Before adopting the external memory solution, some back-of-envelope calculations
|
||||
might help you see whether it's viable. For instance, if your NVMe drive can transfer 4GB
|
||||
(a fairly practical number) of data per second and you have a 100GB of data in compressed
|
||||
XGBoost cache (which corresponds to a dense float32 numpy array with the size of 200GB,
|
||||
give or take). A tree with depth 8 needs at least 16 iterations through the data when the
|
||||
parameter is right. You need about 14 minutes to train a single tree without accounting
|
||||
for some other overheads and assume the computation overlaps with the IO. If your dataset
|
||||
happens to have TB-level size, then you might need thousands of trees to get a generalized
|
||||
model. These calculations can help you get an estimate on the expected training time.
|
||||
|
||||
However, sometimes we can ameliorate this limitation. One should also consider that the OS
|
||||
(mostly talking about the Linux kernel) can usually cache the data on host memory. It only
|
||||
evicts pages when new data comes in and there's no room left. In practice, at least some
|
||||
portion of the data can persist on the host memory throughout the entire training
|
||||
session. We are aware of this cache when optimizing the external memory fetcher. The
|
||||
compressed cache is usually smaller than the raw input data, especially when the input is
|
||||
dense without any missing value. If the host memory can fit a significant portion of this
|
||||
compressed cache, then the performance should be decent after initialization. Our
|
||||
development so far focuses on two fronts of optimization for external memory:
|
||||
|
||||
- Avoid iterating through the data whenever appropriate.
|
||||
- If the OS can cache the data, the performance should be close to in-core training.
|
||||
|
||||
Starting with XGBoost 2.0, the implementation of external memory uses ``mmap``. It is not
|
||||
tested against system errors like disconnected network devices (`SIGBUS`). In the face of
|
||||
a bus error, you will see a hard crash and need to clean up the cache files. If the
|
||||
training session might take a long time and you are using solutions like NVMe-oF, we
|
||||
recommend checkpointing your model periodically. Also, it's worth noting that most tests
|
||||
have been conducted on Linux distributions.
|
||||
|
||||
|
||||
Another important point to keep in mind is that creating the initial cache for XGBoost may
|
||||
take some time. The interface to external memory is through custom iterators, which we can
|
||||
not assume to be thread-safe. Therefore, initialization is performed sequentially. Using
|
||||
the `xgboost.config_context` with `verbosity=2` can give you some information on what
|
||||
XGBoost is doing during the wait if you don't mind the extra output.
|
||||
|
||||
*******************************
|
||||
Compared to the QuantileDMatrix
|
||||
*******************************
|
||||
|
||||
Passing an iterator to the :py:class:`~xgboost.QuantileDMatrix` enables direct
|
||||
construction of `QuantileDMatrix` with data chunks. On the other hand, if it's passed to
|
||||
:py:class:`~xgboost.DMatrix`, it instead enables the external memory feature. The
|
||||
:py:class:`~xgboost.QuantileDMatrix` concatenates the data in memory after compression and
|
||||
doesn't fetch data during training. On the other hand, the external memory `DMatrix`
|
||||
fetches data batches from external memory on-demand. Use the `QuantileDMatrix` (with
|
||||
iterator if necessary) when you can fit most of your data in memory. The training would be
|
||||
an order of magnitude faster than using external memory.
|
||||
The above snippet is a simplified version of ``demo/guide-python/external_memory.py``. For
|
||||
an example in C, please see ``demo/c-api/external-memory/``.
|
||||
|
||||
****************
|
||||
Text File Inputs
|
||||
****************
|
||||
|
||||
This is the original form of external memory support, users are encouraged to use custom
|
||||
data iterator instead. There is no big difference between using external memory version of
|
||||
text input and the in-memory version. The only difference is the filename format.
|
||||
There is no big difference between using external memory version and in-memory version.
|
||||
The only difference is the filename format.
|
||||
|
||||
The external memory version takes in the following `URI
|
||||
<https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format:
|
||||
The external memory version takes in the following `URI <https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
@@ -240,8 +91,9 @@ To load from csv files, use the following syntax:
|
||||
|
||||
where ``label_column`` should point to the csv column acting as the label.
|
||||
|
||||
If you have a dataset stored in a file similar to ``demo/data/agaricus.txt.train`` with LIBSVM
|
||||
format, the external memory support can be enabled by:
|
||||
To provide a simple example for illustration, extracting the code from
|
||||
`demo/guide-python/external_memory.py <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/external_memory.py>`_. If
|
||||
you have a dataset stored in a file similar to ``agaricus.txt.train`` with LIBSVM format, the external memory support can be enabled by:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@@ -252,3 +104,35 @@ XGBoost will first load ``agaricus.txt.train`` in, preprocess it, then write to
|
||||
more notes about text input formats, see :doc:`/tutorials/input_format`.
|
||||
|
||||
For CLI version, simply add the cache suffix, e.g. ``"../data/agaricus.txt.train?format=libsvm#dtrain.cache"``.
|
||||
|
||||
|
||||
**********************************
|
||||
GPU Version (GPU Hist tree method)
|
||||
**********************************
|
||||
External memory is supported in GPU algorithms (i.e. when ``tree_method`` is set to ``gpu_hist``).
|
||||
|
||||
If you are still getting out-of-memory errors after enabling external memory, try subsampling the
|
||||
data to further reduce GPU memory usage:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
param = {
|
||||
...
|
||||
'subsample': 0.1,
|
||||
'sampling_method': 'gradient_based',
|
||||
}
|
||||
|
||||
For more information, see `this paper <https://arxiv.org/abs/2005.09148>`_. Internally
|
||||
the tree method still concatenates all the chunks into 1 final histogram index due to
|
||||
performance reason, but in compressed format. So its scalability has an upper bound but
|
||||
still has lower memory cost in general.
|
||||
|
||||
***********
|
||||
CPU Version
|
||||
***********
|
||||
|
||||
For CPU histogram based tree methods (``approx``, ``hist``) it's recommended to use
|
||||
``grow_policy=depthwise`` for performance reason. Iterating over data batches is slow,
|
||||
with ``depthwise`` policy XGBoost can build an entire layer of tree nodes with a few
|
||||
iterations, while with ``lossguide`` XGBoost needs to iterate over the data set for each
|
||||
tree node.
|
||||
|
||||
@@ -11,22 +11,22 @@ See `Awesome XGBoost <https://github.com/dmlc/xgboost/tree/master/demo>`_ for mo
|
||||
|
||||
model
|
||||
saving_model
|
||||
learning_to_rank
|
||||
dart
|
||||
monotonic
|
||||
feature_interaction_constraint
|
||||
aft_survival_analysis
|
||||
categorical
|
||||
multioutput
|
||||
rf
|
||||
kubernetes
|
||||
Distributed XGBoost with XGBoost4J-Spark <https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_tutorial.html>
|
||||
Distributed XGBoost with XGBoost4J-Spark-GPU <https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_gpu_tutorial.html>
|
||||
dask
|
||||
spark_estimator
|
||||
ray
|
||||
external_memory
|
||||
dart
|
||||
monotonic
|
||||
rf
|
||||
feature_interaction_constraint
|
||||
learning_to_rank
|
||||
aft_survival_analysis
|
||||
c_api_tutorial
|
||||
input_format
|
||||
param_tuning
|
||||
external_memory
|
||||
custom_metric_obj
|
||||
categorical
|
||||
multioutput
|
||||
|
||||
@@ -48,9 +48,8 @@ Notice that the samples are sorted based on their query index in a non-decreasin
|
||||
import xgboost as xgb
|
||||
|
||||
# Make a synthetic ranking dataset for demonstration
|
||||
seed = 1994
|
||||
X, y = make_classification(random_state=seed)
|
||||
rng = np.random.default_rng(seed)
|
||||
X, y = make_classification(random_state=rng)
|
||||
rng = np.random.default_rng(1994)
|
||||
n_query_groups = 3
|
||||
qid = rng.integers(0, 3, size=X.shape[0])
|
||||
|
||||
|
||||
@@ -58,45 +58,3 @@ This can affect the training of XGBoost model, and there are two ways to improve
|
||||
|
||||
- In such a case, you cannot re-balance the dataset
|
||||
- Set parameter ``max_delta_step`` to a finite number (say 1) to help convergence
|
||||
|
||||
|
||||
*********************
|
||||
Reducing Memory Usage
|
||||
*********************
|
||||
|
||||
If you are using an HPO library like :py:class:`sklearn.model_selection.GridSearchCV`,
|
||||
please control the number of threads it can use. It's best to let XGBoost run in
|
||||
parallel instead of asking `GridSearchCV` to run multiple experiments at the same
|
||||
time. For instance, creating a fold of data for cross validation can consume a significant
|
||||
amount of memory:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# This creates a copy of dataset. X and X_train are both in memory at the same time.
|
||||
|
||||
# This happens for every thread at the same time if you run `GridSearchCV` with
|
||||
# `n_jobs` larger than 1
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
df = pd.DataFrame()
|
||||
# This creates a new copy of the dataframe, even if you specify the inplace parameter
|
||||
new_df = df.drop(...)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
array = np.array(...)
|
||||
# This may or may not make a copy of the data, depending on the type of the data
|
||||
array.astype(np.float32)
|
||||
|
||||
.. code-block::
|
||||
|
||||
# np by default uses double, do you actually need it?
|
||||
array = np.array(...)
|
||||
|
||||
You can find some more specific memory reduction practices scattered through the documents.
|
||||
For instance: :doc:`/tutorials/dask`, :doc:`/gpu/index`. However, before going into
|
||||
these, being conscious about making data copies is a good starting point. It usually
|
||||
consumes a lot more memory than people expect.
|
||||
|
||||
@@ -173,6 +173,7 @@ Will print out something similar to (not actual output as it's too long for demo
|
||||
"gradient_booster": {
|
||||
"gbtree_train_param": {
|
||||
"num_parallel_tree": "1",
|
||||
"predictor": "gpu_predictor",
|
||||
"process_type": "default",
|
||||
"tree_method": "gpu_hist",
|
||||
"updater": "grow_gpu_hist",
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
#include <dmlc/omp.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
@@ -113,7 +112,7 @@ using bst_row_t = std::size_t; // NOLINT
|
||||
/*! \brief Type for tree node index. */
|
||||
using bst_node_t = std::int32_t; // NOLINT
|
||||
/*! \brief Type for ranking group index. */
|
||||
using bst_group_t = std::uint32_t; // NOLINT
|
||||
using bst_group_t = std::uint32_t; // NOLINT
|
||||
/**
|
||||
* \brief Type for indexing into output targets.
|
||||
*/
|
||||
@@ -126,10 +125,6 @@ using bst_layer_t = std::int32_t; // NOLINT
|
||||
* \brief Type for indexing trees.
|
||||
*/
|
||||
using bst_tree_t = std::int32_t; // NOLINT
|
||||
/**
|
||||
* @brief Ordinal of a CUDA device.
|
||||
*/
|
||||
using bst_d_ordinal_t = std::int16_t; // NOLINT
|
||||
|
||||
namespace detail {
|
||||
/*! \brief Implementation of gradient statistics pair. Template specialisation
|
||||
|
||||
@@ -1067,9 +1067,6 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, DMatrixHandle dmat
|
||||
/**
|
||||
* \brief Inplace prediction from CPU dense matrix.
|
||||
*
|
||||
* \note If the booster is configured to run on a CUDA device, XGBoost falls back to run
|
||||
* prediction with DMatrix with a performance warning.
|
||||
*
|
||||
* \param handle Booster handle.
|
||||
* \param values JSON encoded __array_interface__ to values.
|
||||
* \param config See \ref XGBoosterPredictFromDMatrix for more info.
|
||||
@@ -1094,9 +1091,6 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *values,
|
||||
/**
|
||||
* \brief Inplace prediction from CPU CSR matrix.
|
||||
*
|
||||
* \note If the booster is configured to run on a CUDA device, XGBoost falls back to run
|
||||
* prediction with DMatrix with a performance warning.
|
||||
*
|
||||
* \param handle Booster handle.
|
||||
* \param indptr JSON encoded __array_interface__ to row pointer in CSR.
|
||||
* \param indices JSON encoded __array_interface__ to column indices in CSR.
|
||||
@@ -1122,9 +1116,6 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch
|
||||
/**
|
||||
* \brief Inplace prediction from CUDA Dense matrix (cupy in Python).
|
||||
*
|
||||
* \note If the booster is configured to run on a CPU, XGBoost falls back to run
|
||||
* prediction with DMatrix with a performance warning.
|
||||
*
|
||||
* \param handle Booster handle
|
||||
* \param values JSON encoded __cuda_array_interface__ to values.
|
||||
* \param config See \ref XGBoosterPredictFromDMatrix for more info.
|
||||
@@ -1146,9 +1137,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *valu
|
||||
/**
|
||||
* \brief Inplace prediction from CUDA dense dataframe (cuDF in Python).
|
||||
*
|
||||
* \note If the booster is configured to run on a CPU, XGBoost falls back to run
|
||||
* prediction with DMatrix with a performance warning.
|
||||
*
|
||||
* \param handle Booster handle
|
||||
* \param values List of __cuda_array_interface__ for all columns encoded in JSON list.
|
||||
* \param config See \ref XGBoosterPredictFromDMatrix for more info.
|
||||
|
||||
@@ -1,80 +1,20 @@
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2014-2022 by Contributors
|
||||
* \file context.h
|
||||
*/
|
||||
#ifndef XGBOOST_CONTEXT_H_
|
||||
#define XGBOOST_CONTEXT_H_
|
||||
|
||||
#include <xgboost/base.h> // for bst_d_ordinal_t
|
||||
#include <xgboost/logging.h> // for CHECK_GE
|
||||
#include <xgboost/parameter.h> // for XGBoostParameter
|
||||
#include <xgboost/logging.h>
|
||||
#include <xgboost/parameter.h>
|
||||
|
||||
#include <cstdint> // for int16_t, int32_t, int64_t
|
||||
#include <memory> // for shared_ptr
|
||||
#include <string> // for string, to_string
|
||||
#include <type_traits> // for invoke_result_t, is_same_v
|
||||
#include <memory> // std::shared_ptr
|
||||
#include <string>
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
struct CUDAContext;
|
||||
|
||||
/**
|
||||
* @brief A type for device ordinal. The type is packed into 32-bit for efficient use in
|
||||
* viewing types like `linalg::TensorView`.
|
||||
*/
|
||||
struct DeviceOrd {
|
||||
enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
|
||||
// CUDA device ordinal.
|
||||
bst_d_ordinal_t ordinal{-1};
|
||||
|
||||
[[nodiscard]] bool IsCUDA() const { return device == kCUDA; }
|
||||
[[nodiscard]] bool IsCPU() const { return device == kCPU; }
|
||||
|
||||
DeviceOrd() = default;
|
||||
constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
|
||||
|
||||
DeviceOrd(DeviceOrd const& that) = default;
|
||||
DeviceOrd& operator=(DeviceOrd const& that) = default;
|
||||
DeviceOrd(DeviceOrd&& that) = default;
|
||||
DeviceOrd& operator=(DeviceOrd&& that) = default;
|
||||
|
||||
/**
|
||||
* @brief Constructor for CPU.
|
||||
*/
|
||||
[[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; }
|
||||
/**
|
||||
* @brief Constructor for CUDA device.
|
||||
*
|
||||
* @param ordinal CUDA device ordinal.
|
||||
*/
|
||||
[[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; }
|
||||
|
||||
[[nodiscard]] bool operator==(DeviceOrd const& that) const {
|
||||
return device == that.device && ordinal == that.ordinal;
|
||||
}
|
||||
[[nodiscard]] bool operator!=(DeviceOrd const& that) const { return !(*this == that); }
|
||||
/**
|
||||
* @brief Get a string representation of the device and the ordinal.
|
||||
*/
|
||||
[[nodiscard]] std::string Name() const {
|
||||
switch (device) {
|
||||
case DeviceOrd::kCPU:
|
||||
return "CPU";
|
||||
case DeviceOrd::kCUDA:
|
||||
return "CUDA:" + std::to_string(ordinal);
|
||||
default: {
|
||||
LOG(FATAL) << "Unknown device.";
|
||||
return "";
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
|
||||
|
||||
/**
|
||||
* @brief Runtime context for XGBoost. Contains information like threads and device.
|
||||
*/
|
||||
struct Context : public XGBoostParameter<Context> {
|
||||
public:
|
||||
// Constant representing the device ID of CPU.
|
||||
@@ -96,82 +36,33 @@ struct Context : public XGBoostParameter<Context> {
|
||||
// fail when gpu_id is invalid
|
||||
bool fail_on_invalid_gpu_id{false};
|
||||
bool validate_parameters{false};
|
||||
/**
|
||||
* @brief Configure the parameter `gpu_id'.
|
||||
|
||||
/*!
|
||||
* \brief Configure the parameter `gpu_id'.
|
||||
*
|
||||
* @param require_gpu Whether GPU is explicitly required by the user through other
|
||||
* configurations.
|
||||
* \param require_gpu Whether GPU is explicitly required from user.
|
||||
*/
|
||||
void ConfigureGpuId(bool require_gpu);
|
||||
/**
|
||||
* @brief Returns the automatically chosen number of threads based on the `nthread`
|
||||
* parameter and the system settting.
|
||||
/*!
|
||||
* Return automatically chosen threads.
|
||||
*/
|
||||
[[nodiscard]] std::int32_t Threads() const;
|
||||
/**
|
||||
* @brief Is XGBoost running on CPU?
|
||||
*/
|
||||
[[nodiscard]] bool IsCPU() const { return gpu_id == kCpuId; }
|
||||
/**
|
||||
* @brief Is XGBoost running on a CUDA device?
|
||||
*/
|
||||
[[nodiscard]] bool IsCUDA() const { return !IsCPU(); }
|
||||
/**
|
||||
* @brief Get the current device and ordinal.
|
||||
*/
|
||||
[[nodiscard]] DeviceOrd Device() const {
|
||||
return IsCPU() ? DeviceOrd::CPU() : DeviceOrd::CUDA(static_cast<bst_d_ordinal_t>(gpu_id));
|
||||
}
|
||||
/**
|
||||
* @brief Get the CUDA device ordinal. -1 if XGBoost is running on CPU.
|
||||
*/
|
||||
[[nodiscard]] bst_d_ordinal_t Ordinal() const { return this->gpu_id; }
|
||||
/**
|
||||
* @brief Name of the current device.
|
||||
*/
|
||||
[[nodiscard]] std::string DeviceName() const { return Device().Name(); }
|
||||
/**
|
||||
* @brief Get a CUDA device context for allocator and stream.
|
||||
*/
|
||||
[[nodiscard]] CUDAContext const* CUDACtx() const;
|
||||
/**
|
||||
* @brief Make a CUDA context based on the current context.
|
||||
*
|
||||
* @param ordinal The CUDA device ordinal.
|
||||
*/
|
||||
[[nodiscard]] Context MakeCUDA(std::int32_t ordinal = 0) const {
|
||||
std::int32_t Threads() const;
|
||||
|
||||
bool IsCPU() const { return gpu_id == kCpuId; }
|
||||
bool IsCUDA() const { return !IsCPU(); }
|
||||
|
||||
CUDAContext const* CUDACtx() const;
|
||||
// Make a CUDA context based on the current context.
|
||||
Context MakeCUDA(std::int32_t device = 0) const {
|
||||
Context ctx = *this;
|
||||
CHECK_GE(ordinal, 0);
|
||||
ctx.gpu_id = ordinal;
|
||||
ctx.gpu_id = device;
|
||||
return ctx;
|
||||
}
|
||||
/**
|
||||
* @brief Make a CPU context based on the current context.
|
||||
*/
|
||||
[[nodiscard]] Context MakeCPU() const {
|
||||
Context MakeCPU() const {
|
||||
Context ctx = *this;
|
||||
ctx.gpu_id = kCpuId;
|
||||
return ctx;
|
||||
}
|
||||
/**
|
||||
* @brief Call function based on the current device.
|
||||
*/
|
||||
template <typename CPUFn, typename CUDAFn>
|
||||
decltype(auto) DispatchDevice(CPUFn&& cpu_fn, CUDAFn&& cuda_fn) const {
|
||||
static_assert(std::is_same_v<std::invoke_result_t<CPUFn>, std::invoke_result_t<CUDAFn>>);
|
||||
switch (this->Device().device) {
|
||||
case DeviceOrd::kCPU:
|
||||
return cpu_fn();
|
||||
case DeviceOrd::kCUDA:
|
||||
return cuda_fn();
|
||||
default:
|
||||
// Do not use the device name as this is likely an internal error, the name
|
||||
// wouldn't be valid.
|
||||
LOG(FATAL) << "Unknown device type:" << static_cast<std::int16_t>(this->Device().device);
|
||||
break;
|
||||
}
|
||||
return std::invoke_result_t<CPUFn>();
|
||||
}
|
||||
|
||||
// declare parameters
|
||||
DMLC_DECLARE_PARAMETER(Context) {
|
||||
@@ -196,9 +87,9 @@ struct Context : public XGBoostParameter<Context> {
|
||||
}
|
||||
|
||||
private:
|
||||
// mutable for lazy cuda context initialization. This avoids initializing CUDA at load.
|
||||
// shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define
|
||||
// p_impl while trying to hide CUDA code from the host compiler.
|
||||
// mutable for lazy initialization for cuda context to avoid initializing CUDA at load.
|
||||
// shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define p_impl
|
||||
// while trying to hide CUDA code from host compiler.
|
||||
mutable std::shared_ptr<CUDAContext> cuctx_;
|
||||
// cached value for CFS CPU limit. (used in containerized env)
|
||||
std::int32_t cfs_cpu_count_; // NOLINT
|
||||
|
||||
@@ -149,14 +149,18 @@ class GradientBooster : public Model, public Configurable {
|
||||
* \param layer_begin Beginning of boosted tree layer used for prediction.
|
||||
* \param layer_end End of booster layer. 0 means do not limit trees.
|
||||
* \param approximate use a faster (inconsistent) approximation of SHAP values
|
||||
* \param condition condition on the condition_feature (0=no, -1=cond off, 1=cond on).
|
||||
* \param condition_feature feature to condition on (i.e. fix) during calculations
|
||||
*/
|
||||
virtual void PredictContribution(DMatrix* dmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate = false) = 0;
|
||||
virtual void PredictContribution(DMatrix* dmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned layer_begin, unsigned layer_end,
|
||||
bool approximate = false, int condition = 0,
|
||||
unsigned condition_feature = 0) = 0;
|
||||
|
||||
virtual void PredictInteractionContributions(DMatrix* dmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate) = 0;
|
||||
virtual void PredictInteractionContributions(
|
||||
DMatrix *dmat, HostDeviceVector<bst_float> *out_contribs,
|
||||
unsigned layer_begin, unsigned layer_end, bool approximate) = 0;
|
||||
|
||||
/*!
|
||||
* \brief dump the model in the requested format
|
||||
|
||||
@@ -6,22 +6,24 @@
|
||||
*/
|
||||
#pragma once
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/cache.h> // for DMatrixCache
|
||||
#include <xgboost/context.h> // for Context
|
||||
#include <xgboost/cache.h> // DMatrixCache
|
||||
#include <xgboost/context.h>
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/host_device_vector.h>
|
||||
|
||||
#include <functional> // for function
|
||||
#include <memory> // for shared_ptr
|
||||
#include <functional> // std::function
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <thread> // for get_id
|
||||
#include <utility> // for make_pair
|
||||
#include <vector>
|
||||
|
||||
// Forward declarations
|
||||
namespace xgboost::gbm {
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
struct GBTreeModel;
|
||||
} // namespace xgboost::gbm
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
|
||||
namespace xgboost {
|
||||
/**
|
||||
|
||||
@@ -47,7 +47,7 @@
|
||||
<spark.rapids.version>23.04.1</spark.rapids.version>
|
||||
<cudf.classifier>cuda11</cudf.classifier>
|
||||
<scalatest.version>3.2.16</scalatest.version>
|
||||
<scala-collection-compat.version>2.10.0</scala-collection-compat.version>
|
||||
<scala-collection-compat.version>2.11.0</scala-collection-compat.version>
|
||||
</properties>
|
||||
<repositories>
|
||||
<repository>
|
||||
@@ -91,9 +91,6 @@
|
||||
<value>ON</value>
|
||||
</property>
|
||||
</activation>
|
||||
<properties>
|
||||
<use.cuda>ON</use.cuda>
|
||||
</properties>
|
||||
<modules>
|
||||
<module>xgboost4j-gpu</module>
|
||||
<module>xgboost4j-spark-gpu</module>
|
||||
|
||||
@@ -78,6 +78,7 @@ public class BoosterTest {
|
||||
put("num_round", round);
|
||||
put("num_workers", 1);
|
||||
put("tree_method", "gpu_hist");
|
||||
put("predictor", "gpu_predictor");
|
||||
put("max_bin", maxBin);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -281,6 +281,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
|
||||
// - predictor: Force to gpu predictor since native doesn't save predictor.
|
||||
val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0
|
||||
booster.setParam("gpu_id", gpuId.toString)
|
||||
booster.setParam("predictor", "gpu_predictor")
|
||||
logger.info("GPU transform on device: " + gpuId)
|
||||
boosterFlag.isGpuParamsSet = true;
|
||||
}
|
||||
|
||||
@@ -132,8 +132,8 @@ def locate_or_build_libxgboost(
|
||||
|
||||
if build_config.use_system_libxgboost:
|
||||
# Find libxgboost from system prefix
|
||||
sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
|
||||
libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
|
||||
sys_prefix = pathlib.Path(sys.prefix).absolute().resolve()
|
||||
libxgboost_sys = sys_prefix / "lib" / _lib_name()
|
||||
if not libxgboost_sys.exists():
|
||||
raise RuntimeError(
|
||||
f"use_system_libxgboost was specified but {_lib_name()} is "
|
||||
|
||||
@@ -2187,25 +2187,20 @@ class Booster:
|
||||
base_margin: Any = None,
|
||||
strict_shape: bool = False,
|
||||
) -> NumpyOrCupy:
|
||||
"""Run prediction in-place when possible, Unlike :py:meth:`predict` method,
|
||||
inplace prediction does not cache the prediction result.
|
||||
"""Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction
|
||||
does not cache the prediction result.
|
||||
|
||||
Calling only ``inplace_predict`` in multiple threads is safe and lock
|
||||
free. But the safety does not hold when used in conjunction with other
|
||||
methods. E.g. you can't train the booster in one thread and perform
|
||||
prediction in the other.
|
||||
|
||||
.. note::
|
||||
|
||||
If the device ordinal of the input data doesn't match the one configured for
|
||||
the booster, data will be copied to the booster device.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
booster.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
|
||||
booster.set_param({"predictor": "gpu_predictor"})
|
||||
booster.inplace_predict(cupy_array)
|
||||
|
||||
booster.set_param({"gpu_id": "-1", "tree_method": "hist"})
|
||||
booster.set_param({"predictor": "cpu_predictor"})
|
||||
booster.inplace_predict(numpy_array)
|
||||
|
||||
.. versionadded:: 1.1.0
|
||||
@@ -2213,7 +2208,9 @@ class Booster:
|
||||
Parameters
|
||||
----------
|
||||
data :
|
||||
The input data.
|
||||
The input data, must not be a view for numpy array. Set
|
||||
``predictor`` to ``gpu_predictor`` for running prediction on CuPy
|
||||
array or CuDF DataFrame.
|
||||
iteration_range :
|
||||
See :py:meth:`predict` for details.
|
||||
predict_type :
|
||||
|
||||
@@ -27,7 +27,7 @@ def find_lib_path() -> List[str]:
|
||||
os.path.join(curr_path, os.path.pardir, os.path.pardir, "lib"),
|
||||
# use libxgboost from a system prefix, if available. This should be the last
|
||||
# option.
|
||||
os.path.join(sys.base_prefix, "lib"),
|
||||
os.path.join(sys.prefix, "lib"),
|
||||
]
|
||||
|
||||
if sys.platform == "win32":
|
||||
@@ -62,8 +62,8 @@ def find_lib_path() -> List[str]:
|
||||
+ ("\n- ".join(dll_path))
|
||||
+ "\nXGBoost Python package path: "
|
||||
+ curr_path
|
||||
+ "\nsys.base_prefix: "
|
||||
+ sys.base_prefix
|
||||
+ "\nsys.prefix: "
|
||||
+ sys.prefix
|
||||
+ "\nSee: "
|
||||
+ link
|
||||
+ " for installing XGBoost."
|
||||
|
||||
@@ -277,6 +277,9 @@ __model_doc = f"""
|
||||
Device ordinal.
|
||||
validate_parameters : Optional[bool]
|
||||
Give warnings for unknown parameter.
|
||||
predictor : Optional[str]
|
||||
Force XGBoost to use specific predictor, available choices are [cpu_predictor,
|
||||
gpu_predictor].
|
||||
enable_categorical : bool
|
||||
|
||||
.. versionadded:: 1.5.0
|
||||
@@ -378,21 +381,17 @@ __model_doc = f"""
|
||||
every **early_stopping_rounds** round(s) to continue training. Requires at
|
||||
least one item in **eval_set** in :py:meth:`fit`.
|
||||
|
||||
- If early stopping occurs, the model will have two additional attributes:
|
||||
:py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the
|
||||
:py:meth:`predict` and :py:meth:`apply` methods to determine the optimal
|
||||
number of trees during inference. If users want to access the full model
|
||||
(including trees built after early stopping), they can specify the
|
||||
`iteration_range` in these inference methods. In addition, other utilities
|
||||
like model plotting can also use the entire model.
|
||||
|
||||
- If you prefer to discard the trees after `best_iteration`, consider using the
|
||||
callback function :py:class:`xgboost.callback.EarlyStopping`.
|
||||
- The method returns the model from the last iteration, not the best one, use a
|
||||
callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
|
||||
model is preferred.
|
||||
|
||||
- If there's more than one item in **eval_set**, the last entry will be used for
|
||||
early stopping. If there's more than one metric in **eval_metric**, the last
|
||||
metric will be used for early stopping.
|
||||
|
||||
- If early stopping occurs, the model will have three additional fields:
|
||||
:py:attr:`best_score`, :py:attr:`best_iteration`.
|
||||
|
||||
.. note::
|
||||
|
||||
This parameter replaces `early_stopping_rounds` in :py:meth:`fit` method.
|
||||
@@ -649,6 +648,7 @@ class XGBModel(XGBModelBase):
|
||||
importance_type: Optional[str] = None,
|
||||
gpu_id: Optional[int] = None,
|
||||
validate_parameters: Optional[bool] = None,
|
||||
predictor: Optional[str] = None,
|
||||
enable_categorical: bool = False,
|
||||
feature_types: Optional[FeatureTypes] = None,
|
||||
max_cat_to_onehot: Optional[int] = None,
|
||||
@@ -695,6 +695,7 @@ class XGBModel(XGBModelBase):
|
||||
self.importance_type = importance_type
|
||||
self.gpu_id = gpu_id
|
||||
self.validate_parameters = validate_parameters
|
||||
self.predictor = predictor
|
||||
self.enable_categorical = enable_categorical
|
||||
self.feature_types = feature_types
|
||||
self.max_cat_to_onehot = max_cat_to_onehot
|
||||
@@ -930,7 +931,8 @@ class XGBModel(XGBModelBase):
|
||||
callbacks = self.callbacks if self.callbacks is not None else callbacks
|
||||
|
||||
tree_method = params.get("tree_method", None)
|
||||
if self.enable_categorical and tree_method == "exact":
|
||||
cat_support = {"gpu_hist", "approx", "hist"}
|
||||
if self.enable_categorical and tree_method not in cat_support:
|
||||
raise ValueError(
|
||||
"Experimental support for categorical data is not implemented for"
|
||||
" current tree method yet."
|
||||
@@ -1087,7 +1089,12 @@ class XGBModel(XGBModelBase):
|
||||
return self
|
||||
|
||||
def _can_use_inplace_predict(self) -> bool:
|
||||
if self.booster != "gblinear":
|
||||
# When predictor is explicitly set, using `inplace_predict` might result into
|
||||
# error with incompatible data type.
|
||||
# Inplace predict doesn't handle as many data types as DMatrix, but it's
|
||||
# sufficient for dask interface where input is simpiler.
|
||||
predictor = self.get_xgb_params().get("predictor", None)
|
||||
if predictor in ("auto", None) and self.booster != "gblinear":
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -1113,9 +1120,9 @@ class XGBModel(XGBModelBase):
|
||||
iteration_range: Optional[Tuple[int, int]] = None,
|
||||
) -> ArrayLike:
|
||||
"""Predict with `X`. If the model is trained with early stopping, then
|
||||
:py:attr:`best_iteration` is used automatically. The estimator uses
|
||||
`inplace_predict` by default and falls back to using :py:class:`DMatrix` if
|
||||
devices between the data and the estimator don't match.
|
||||
:py:attr:`best_iteration` is used automatically. For tree models, when data is
|
||||
on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
|
||||
prediction is run on GPU automatically, otherwise it will run on CPU.
|
||||
|
||||
.. note:: This function is only thread safe for `gbtree` and `dart`.
|
||||
|
||||
@@ -1577,9 +1584,7 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
|
||||
) -> np.ndarray:
|
||||
"""Predict the probability of each `X` example being of a given class. If the
|
||||
model is trained with early stopping, then :py:attr:`best_iteration` is used
|
||||
automatically. The estimator uses `inplace_predict` by default and falls back to
|
||||
using :py:class:`DMatrix` if devices between the data and the estimator don't
|
||||
match.
|
||||
automatically.
|
||||
|
||||
.. note:: This function is only thread safe for `gbtree` and `dart`.
|
||||
|
||||
|
||||
@@ -25,7 +25,6 @@ from typing import (
|
||||
Set,
|
||||
Tuple,
|
||||
TypedDict,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
@@ -199,14 +198,14 @@ class IteratorForTest(xgb.core.DataIter):
|
||||
X: Sequence,
|
||||
y: Sequence,
|
||||
w: Optional[Sequence],
|
||||
cache: Optional[str],
|
||||
cache: Optional[str] = "./",
|
||||
) -> None:
|
||||
assert len(X) == len(y)
|
||||
self.X = X
|
||||
self.y = y
|
||||
self.w = w
|
||||
self.it = 0
|
||||
super().__init__(cache_prefix=cache)
|
||||
super().__init__(cache)
|
||||
|
||||
def next(self, input_data: Callable) -> int:
|
||||
if self.it == len(self.X):
|
||||
@@ -348,9 +347,7 @@ class TestDataset:
|
||||
if w is not None:
|
||||
weight.append(w)
|
||||
|
||||
it = IteratorForTest(
|
||||
predictor, response, weight if weight else None, cache="cache"
|
||||
)
|
||||
it = IteratorForTest(predictor, response, weight if weight else None)
|
||||
return xgb.DMatrix(it)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
@@ -712,27 +709,6 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
|
||||
)
|
||||
|
||||
|
||||
M = TypeVar("M", xgb.Booster, xgb.XGBModel)
|
||||
|
||||
|
||||
def set_ordinal(ordinal: int, booster: M) -> M:
|
||||
"""Temporary solution for setting the device ordinal until we move away from
|
||||
`gpu_id`.
|
||||
|
||||
"""
|
||||
if ordinal < 0:
|
||||
params = {"gpu_id": -1, "tree_method": "hist"}
|
||||
else:
|
||||
params = {"gpu_id": ordinal, "tree_method": "gpu_hist"}
|
||||
|
||||
if isinstance(booster, xgb.Booster):
|
||||
booster.set_param(params)
|
||||
elif isinstance(booster, xgb.XGBModel):
|
||||
booster.set_params(**params)
|
||||
|
||||
return booster
|
||||
|
||||
|
||||
def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
|
||||
"""Evaluation metric for xgb.train"""
|
||||
label = dtrain.get_label()
|
||||
|
||||
@@ -1,79 +1,73 @@
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright (c) 2014-2019 by Contributors
|
||||
* \file io.h
|
||||
* \brief utilities with different serializable implementations
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef RABIT_INTERNAL_IO_H_
|
||||
#define RABIT_INTERNAL_IO_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdio>
|
||||
#include <cstring> // for memcpy
|
||||
#include <limits>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <limits>
|
||||
#include "rabit/internal/utils.h"
|
||||
#include "rabit/serializable.h"
|
||||
|
||||
namespace rabit::utils {
|
||||
namespace rabit {
|
||||
namespace utils {
|
||||
/*! \brief re-use definition of dmlc::SeekStream */
|
||||
using SeekStream = dmlc::SeekStream;
|
||||
/**
|
||||
* @brief Fixed size memory buffer as a stream.
|
||||
*/
|
||||
/*! \brief fixed size memory buffer */
|
||||
struct MemoryFixSizeBuffer : public SeekStream {
|
||||
public:
|
||||
// similar to SEEK_END in libc
|
||||
static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
|
||||
static size_t constexpr kSeekEnd = std::numeric_limits<size_t>::max();
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Ctor
|
||||
*
|
||||
* @param p_buffer Pointer to the source buffer with size `buffer_size`.
|
||||
* @param buffer_size Size of the source buffer
|
||||
*/
|
||||
MemoryFixSizeBuffer(void *p_buffer, std::size_t buffer_size)
|
||||
: p_buffer_(reinterpret_cast<char *>(p_buffer)), buffer_size_(buffer_size) {}
|
||||
MemoryFixSizeBuffer(void *p_buffer, size_t buffer_size)
|
||||
: p_buffer_(reinterpret_cast<char*>(p_buffer)),
|
||||
buffer_size_(buffer_size) {
|
||||
curr_ptr_ = 0;
|
||||
}
|
||||
~MemoryFixSizeBuffer() override = default;
|
||||
|
||||
std::size_t Read(void *ptr, std::size_t size) override {
|
||||
std::size_t nread = std::min(buffer_size_ - curr_ptr_, size);
|
||||
size_t Read(void *ptr, size_t size) override {
|
||||
size_t nread = std::min(buffer_size_ - curr_ptr_, size);
|
||||
if (nread != 0) std::memcpy(ptr, p_buffer_ + curr_ptr_, nread);
|
||||
curr_ptr_ += nread;
|
||||
return nread;
|
||||
}
|
||||
void Write(const void *ptr, std::size_t size) override {
|
||||
void Write(const void *ptr, size_t size) override {
|
||||
if (size == 0) return;
|
||||
CHECK_LE(curr_ptr_ + size, buffer_size_);
|
||||
utils::Assert(curr_ptr_ + size <= buffer_size_,
|
||||
"write position exceed fixed buffer size");
|
||||
std::memcpy(p_buffer_ + curr_ptr_, ptr, size);
|
||||
curr_ptr_ += size;
|
||||
}
|
||||
void Seek(std::size_t pos) override {
|
||||
void Seek(size_t pos) override {
|
||||
if (pos == kSeekEnd) {
|
||||
curr_ptr_ = buffer_size_;
|
||||
} else {
|
||||
curr_ptr_ = static_cast<std::size_t>(pos);
|
||||
curr_ptr_ = static_cast<size_t>(pos);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* @brief Current position in the buffer (stream).
|
||||
*/
|
||||
std::size_t Tell() override { return curr_ptr_; }
|
||||
[[nodiscard]] virtual bool AtEnd() const { return curr_ptr_ == buffer_size_; }
|
||||
size_t Tell() override {
|
||||
return curr_ptr_;
|
||||
}
|
||||
virtual bool AtEnd() const {
|
||||
return curr_ptr_ == buffer_size_;
|
||||
}
|
||||
|
||||
protected:
|
||||
private:
|
||||
/*! \brief in memory buffer */
|
||||
char *p_buffer_{nullptr};
|
||||
char *p_buffer_;
|
||||
/*! \brief current pointer */
|
||||
std::size_t buffer_size_{0};
|
||||
size_t buffer_size_;
|
||||
/*! \brief current pointer */
|
||||
std::size_t curr_ptr_{0};
|
||||
};
|
||||
size_t curr_ptr_;
|
||||
}; // class MemoryFixSizeBuffer
|
||||
|
||||
/*! \brief a in memory buffer that can be read and write as stream interface */
|
||||
struct MemoryBufferStream : public SeekStream {
|
||||
@@ -115,5 +109,6 @@ struct MemoryBufferStream : public SeekStream {
|
||||
/*! \brief current pointer */
|
||||
size_t curr_ptr_;
|
||||
}; // class MemoryBufferStream
|
||||
} // namespace rabit::utils
|
||||
} // namespace utils
|
||||
} // namespace rabit
|
||||
#endif // RABIT_INTERNAL_IO_H_
|
||||
|
||||
@@ -1023,6 +1023,7 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
|
||||
const float **out_result) {
|
||||
xgboost_CHECK_C_ARG_PTR(c_json_config);
|
||||
auto config = Json::Load(StringView{c_json_config});
|
||||
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
|
||||
|
||||
HostDeviceVector<float> *p_predt{nullptr};
|
||||
auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
|
||||
@@ -1041,7 +1042,6 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
|
||||
xgboost_CHECK_C_ARG_PTR(out_dim);
|
||||
CalcPredictShape(strict_shape, type, n_samples, n_features, chunksize, learner->Groups(),
|
||||
learner->BoostedRounds(), &shape, out_dim);
|
||||
CHECK_GE(p_predt->Size(), n_samples);
|
||||
|
||||
xgboost_CHECK_C_ARG_PTR(out_result);
|
||||
xgboost_CHECK_C_ARG_PTR(out_shape);
|
||||
|
||||
@@ -92,7 +92,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
|
||||
API_END();
|
||||
}
|
||||
|
||||
int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
|
||||
int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
|
||||
char const *c_json_config, std::shared_ptr<DMatrix> p_m,
|
||||
xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
|
||||
const float **out_result) {
|
||||
@@ -107,6 +107,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
|
||||
proxy->SetCUDAArray(c_array_interface);
|
||||
|
||||
auto config = Json::Load(StringView{c_json_config});
|
||||
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
|
||||
auto *learner = static_cast<Learner *>(handle);
|
||||
|
||||
HostDeviceVector<float> *p_predt{nullptr};
|
||||
@@ -117,13 +118,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
|
||||
RequiredArg<Integer>(config, "iteration_begin", __func__),
|
||||
RequiredArg<Integer>(config, "iteration_end", __func__));
|
||||
CHECK(p_predt);
|
||||
if (learner->Ctx()->IsCPU()) {
|
||||
// Prediction using DMatrix as fallback.
|
||||
CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead());
|
||||
} else {
|
||||
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
|
||||
}
|
||||
p_predt->SetDevice(proxy->DeviceIdx());
|
||||
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
|
||||
|
||||
auto &shape = learner->GetThreadLocal().prediction_shape;
|
||||
size_t n_samples = p_m->Info().num_row_;
|
||||
@@ -151,7 +146,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *c
|
||||
if (m) {
|
||||
p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
|
||||
}
|
||||
return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
|
||||
return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
|
||||
out_result);
|
||||
}
|
||||
|
||||
@@ -164,6 +159,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *c_js
|
||||
p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
|
||||
}
|
||||
xgboost_CHECK_C_ARG_PTR(out_result);
|
||||
return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
|
||||
return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
|
||||
out_result);
|
||||
}
|
||||
|
||||
@@ -29,21 +29,13 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
|
||||
old_device_ordinal = device_ordinal;
|
||||
old_world_size = communicator_->GetWorldSize();
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
switch (type_) {
|
||||
case CommunicatorType::kRabit:
|
||||
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
|
||||
break;
|
||||
case CommunicatorType::kFederated:
|
||||
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
|
||||
break;
|
||||
case CommunicatorType::kInMemory:
|
||||
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
|
||||
break;
|
||||
default:
|
||||
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
|
||||
if (type_ != CommunicatorType::kFederated) {
|
||||
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get()));
|
||||
} else {
|
||||
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
|
||||
}
|
||||
#else
|
||||
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
|
||||
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
|
||||
#endif
|
||||
}
|
||||
return device_communicator_.get();
|
||||
|
||||
@@ -11,18 +11,21 @@ namespace collective {
|
||||
|
||||
class DeviceCommunicatorAdapter : public DeviceCommunicator {
|
||||
public:
|
||||
explicit DeviceCommunicatorAdapter(int device_ordinal)
|
||||
: device_ordinal_{device_ordinal}, world_size_{GetWorldSize()}, rank_{GetRank()} {
|
||||
DeviceCommunicatorAdapter(int device_ordinal, Communicator *communicator)
|
||||
: device_ordinal_{device_ordinal}, communicator_{communicator} {
|
||||
if (device_ordinal_ < 0) {
|
||||
LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
|
||||
}
|
||||
if (communicator_ == nullptr) {
|
||||
LOG(FATAL) << "Communicator cannot be null.";
|
||||
}
|
||||
}
|
||||
|
||||
~DeviceCommunicatorAdapter() override = default;
|
||||
|
||||
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
|
||||
Operation op) override {
|
||||
if (world_size_ == 1) {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -30,34 +33,37 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
|
||||
auto size = count * GetTypeSize(data_type);
|
||||
host_buffer_.reserve(size);
|
||||
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
|
||||
Allreduce(host_buffer_.data(), count, data_type, op);
|
||||
communicator_->AllReduce(host_buffer_.data(), count, data_type, op);
|
||||
dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
|
||||
dh::caching_device_vector<char> *receive_buffer) override {
|
||||
if (world_size_ == 1) {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
int const world_size = communicator_->GetWorldSize();
|
||||
int const rank = communicator_->GetRank();
|
||||
|
||||
segments->clear();
|
||||
segments->resize(world_size_, 0);
|
||||
segments->at(rank_) = length_bytes;
|
||||
Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
|
||||
segments->resize(world_size, 0);
|
||||
segments->at(rank) = length_bytes;
|
||||
communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
|
||||
Operation::kMax);
|
||||
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
|
||||
receive_buffer->resize(total_bytes);
|
||||
|
||||
host_buffer_.reserve(total_bytes);
|
||||
size_t offset = 0;
|
||||
for (int32_t i = 0; i < world_size_; ++i) {
|
||||
for (int32_t i = 0; i < world_size; ++i) {
|
||||
size_t as_bytes = segments->at(i);
|
||||
if (i == rank_) {
|
||||
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
|
||||
if (i == rank) {
|
||||
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank),
|
||||
cudaMemcpyDefault));
|
||||
}
|
||||
Broadcast(host_buffer_.data() + offset, as_bytes, i);
|
||||
communicator_->Broadcast(host_buffer_.data() + offset, as_bytes, i);
|
||||
offset += as_bytes;
|
||||
}
|
||||
dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
|
||||
@@ -70,8 +76,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
|
||||
|
||||
private:
|
||||
int const device_ordinal_;
|
||||
int const world_size_;
|
||||
int const rank_;
|
||||
Communicator *communicator_;
|
||||
/// Host buffer used to call communicator functions.
|
||||
std::vector<char> host_buffer_{};
|
||||
};
|
||||
|
||||
@@ -1,225 +0,0 @@
|
||||
/*!
|
||||
* Copyright 2023 XGBoost contributors
|
||||
*/
|
||||
#if defined(XGBOOST_USE_NCCL)
|
||||
#include "nccl_device_communicator.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync)
|
||||
: device_ordinal_{device_ordinal},
|
||||
needs_sync_{needs_sync},
|
||||
world_size_{GetWorldSize()},
|
||||
rank_{GetRank()} {
|
||||
if (device_ordinal_ < 0) {
|
||||
LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
|
||||
}
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
|
||||
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
|
||||
auto s_this_uuid = s_uuid.subspan(rank_ * kUuidLength, kUuidLength);
|
||||
GetCudaUUID(s_this_uuid);
|
||||
|
||||
// TODO(rongou): replace this with allgather.
|
||||
Allreduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);
|
||||
|
||||
std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world_size_);
|
||||
size_t j = 0;
|
||||
for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
|
||||
converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
|
||||
j++;
|
||||
}
|
||||
|
||||
auto iter = std::unique(converted.begin(), converted.end());
|
||||
auto n_uniques = std::distance(converted.begin(), iter);
|
||||
|
||||
CHECK_EQ(n_uniques, world_size_)
|
||||
<< "Multiple processes within communication group running on same CUDA "
|
||||
<< "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
|
||||
|
||||
nccl_unique_id_ = GetUniqueId();
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
|
||||
dh::safe_cuda(cudaStreamCreate(&cuda_stream_));
|
||||
}
|
||||
|
||||
NcclDeviceCommunicator::~NcclDeviceCommunicator() {
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
if (cuda_stream_) {
|
||||
dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
|
||||
}
|
||||
if (nccl_comm_) {
|
||||
dh::safe_nccl(ncclCommDestroy(nccl_comm_));
|
||||
}
|
||||
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
LOG(CONSOLE) << "======== NCCL Statistics========";
|
||||
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
|
||||
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576;
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
ncclDataType_t GetNcclDataType(DataType const &data_type) {
|
||||
ncclDataType_t result{ncclInt8};
|
||||
switch (data_type) {
|
||||
case DataType::kInt8:
|
||||
result = ncclInt8;
|
||||
break;
|
||||
case DataType::kUInt8:
|
||||
result = ncclUint8;
|
||||
break;
|
||||
case DataType::kInt32:
|
||||
result = ncclInt32;
|
||||
break;
|
||||
case DataType::kUInt32:
|
||||
result = ncclUint32;
|
||||
break;
|
||||
case DataType::kInt64:
|
||||
result = ncclInt64;
|
||||
break;
|
||||
case DataType::kUInt64:
|
||||
result = ncclUint64;
|
||||
break;
|
||||
case DataType::kFloat:
|
||||
result = ncclFloat;
|
||||
break;
|
||||
case DataType::kDouble:
|
||||
result = ncclDouble;
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unknown data type.";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
bool IsBitwiseOp(Operation const &op) {
|
||||
return op == Operation::kBitwiseAND || op == Operation::kBitwiseOR ||
|
||||
op == Operation::kBitwiseXOR;
|
||||
}
|
||||
|
||||
ncclRedOp_t GetNcclRedOp(Operation const &op) {
|
||||
ncclRedOp_t result{ncclMax};
|
||||
switch (op) {
|
||||
case Operation::kMax:
|
||||
result = ncclMax;
|
||||
break;
|
||||
case Operation::kMin:
|
||||
result = ncclMin;
|
||||
break;
|
||||
case Operation::kSum:
|
||||
result = ncclSum;
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unsupported reduce operation.";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size,
|
||||
std::size_t size, cudaStream_t stream) {
|
||||
dh::LaunchN(size, stream, [=] __device__(std::size_t idx) {
|
||||
auto result = device_buffer[idx];
|
||||
for (auto rank = 1; rank < world_size; rank++) {
|
||||
result = func(result, device_buffer[rank * size + idx]);
|
||||
}
|
||||
out_buffer[idx] = result;
|
||||
});
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::size_t count,
|
||||
DataType data_type, Operation op) {
|
||||
auto const size = count * GetTypeSize(data_type);
|
||||
dh::caching_device_vector<char> buffer(size * world_size_);
|
||||
auto *device_buffer = buffer.data().get();
|
||||
|
||||
// First gather data from all the workers.
|
||||
dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
|
||||
nccl_comm_, cuda_stream_));
|
||||
if (needs_sync_) {
|
||||
dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
|
||||
}
|
||||
|
||||
// Then reduce locally.
|
||||
auto *out_buffer = static_cast<char *>(send_receive_buffer);
|
||||
switch (op) {
|
||||
case Operation::kBitwiseAND:
|
||||
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and<char>(), world_size_, size,
|
||||
cuda_stream_);
|
||||
break;
|
||||
case Operation::kBitwiseOR:
|
||||
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or<char>(), world_size_, size,
|
||||
cuda_stream_);
|
||||
break;
|
||||
case Operation::kBitwiseXOR:
|
||||
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor<char>(), world_size_, size,
|
||||
cuda_stream_);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Not a bitwise reduce operation.";
|
||||
}
|
||||
}
|
||||
|
||||
void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t count,
|
||||
DataType data_type, Operation op) {
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
if (IsBitwiseOp(op)) {
|
||||
BitwiseAllReduce(send_receive_buffer, count, data_type, op);
|
||||
} else {
|
||||
dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
|
||||
GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
|
||||
cuda_stream_));
|
||||
}
|
||||
allreduce_bytes_ += count * GetTypeSize(data_type);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
|
||||
std::vector<std::size_t> *segments,
|
||||
dh::caching_device_vector<char> *receive_buffer) {
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
|
||||
segments->clear();
|
||||
segments->resize(world_size_, 0);
|
||||
segments->at(rank_) = length_bytes;
|
||||
Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
|
||||
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
|
||||
receive_buffer->resize(total_bytes);
|
||||
|
||||
size_t offset = 0;
|
||||
dh::safe_nccl(ncclGroupStart());
|
||||
for (int32_t i = 0; i < world_size_; ++i) {
|
||||
size_t as_bytes = segments->at(i);
|
||||
dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
|
||||
ncclChar, i, nccl_comm_, cuda_stream_));
|
||||
offset += as_bytes;
|
||||
}
|
||||
dh::safe_nccl(ncclGroupEnd());
|
||||
}
|
||||
|
||||
void NcclDeviceCommunicator::Synchronize() {
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
|
||||
}
|
||||
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2022-2023 XGBoost contributors
|
||||
* Copyright 2022 XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
@@ -12,26 +12,116 @@ namespace collective {
|
||||
|
||||
class NcclDeviceCommunicator : public DeviceCommunicator {
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new NCCL communicator.
|
||||
* @param device_ordinal The GPU device id.
|
||||
* @param needs_sync Whether extra CUDA stream synchronization is needed.
|
||||
*
|
||||
* In multi-GPU tests when multiple NCCL communicators are created in the same process, sometimes
|
||||
* a deadlock happens because NCCL kernels are blocking. The extra CUDA stream synchronization
|
||||
* makes sure that the NCCL kernels are caught up, thus avoiding the deadlock.
|
||||
*
|
||||
* The Rabit communicator runs with one process per GPU, so the additional synchronization is not
|
||||
* needed. The in-memory communicator is used in tests with multiple threads, each thread
|
||||
* representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
|
||||
*/
|
||||
explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
|
||||
~NcclDeviceCommunicator() override;
|
||||
NcclDeviceCommunicator(int device_ordinal, Communicator *communicator)
|
||||
: device_ordinal_{device_ordinal}, communicator_{communicator} {
|
||||
if (device_ordinal_ < 0) {
|
||||
LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
|
||||
}
|
||||
if (communicator_ == nullptr) {
|
||||
LOG(FATAL) << "Communicator cannot be null.";
|
||||
}
|
||||
|
||||
int32_t const rank = communicator_->GetRank();
|
||||
int32_t const world = communicator_->GetWorldSize();
|
||||
|
||||
if (world == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> uuids(world * kUuidLength, 0);
|
||||
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
|
||||
auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
|
||||
GetCudaUUID(s_this_uuid);
|
||||
|
||||
// TODO(rongou): replace this with allgather.
|
||||
communicator_->AllReduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);
|
||||
|
||||
std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world);
|
||||
size_t j = 0;
|
||||
for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
|
||||
converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
|
||||
j++;
|
||||
}
|
||||
|
||||
auto iter = std::unique(converted.begin(), converted.end());
|
||||
auto n_uniques = std::distance(converted.begin(), iter);
|
||||
|
||||
CHECK_EQ(n_uniques, world)
|
||||
<< "Multiple processes within communication group running on same CUDA "
|
||||
<< "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
|
||||
|
||||
nccl_unique_id_ = GetUniqueId();
|
||||
dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world, nccl_unique_id_, rank));
|
||||
dh::safe_cuda(cudaStreamCreate(&cuda_stream_));
|
||||
}
|
||||
|
||||
~NcclDeviceCommunicator() override {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
if (cuda_stream_) {
|
||||
dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
|
||||
}
|
||||
if (nccl_comm_) {
|
||||
dh::safe_nccl(ncclCommDestroy(nccl_comm_));
|
||||
}
|
||||
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
LOG(CONSOLE) << "======== NCCL Statistics========";
|
||||
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
|
||||
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576;
|
||||
}
|
||||
}
|
||||
|
||||
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
|
||||
Operation op) override;
|
||||
Operation op) override {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
|
||||
GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
|
||||
cuda_stream_));
|
||||
allreduce_bytes_ += count * GetTypeSize(data_type);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
|
||||
dh::caching_device_vector<char> *receive_buffer) override;
|
||||
void Synchronize() override;
|
||||
dh::caching_device_vector<char> *receive_buffer) override {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
int const world_size = communicator_->GetWorldSize();
|
||||
int const rank = communicator_->GetRank();
|
||||
|
||||
segments->clear();
|
||||
segments->resize(world_size, 0);
|
||||
segments->at(rank) = length_bytes;
|
||||
communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
|
||||
Operation::kMax);
|
||||
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
|
||||
receive_buffer->resize(total_bytes);
|
||||
|
||||
size_t offset = 0;
|
||||
dh::safe_nccl(ncclGroupStart());
|
||||
for (int32_t i = 0; i < world_size; ++i) {
|
||||
size_t as_bytes = segments->at(i);
|
||||
dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
|
||||
ncclChar, i, nccl_comm_, cuda_stream_));
|
||||
offset += as_bytes;
|
||||
}
|
||||
dh::safe_nccl(ncclGroupEnd());
|
||||
}
|
||||
|
||||
void Synchronize() override {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr std::size_t kUuidLength =
|
||||
@@ -62,20 +152,71 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
|
||||
ncclUniqueId GetUniqueId() {
|
||||
static const int kRootRank = 0;
|
||||
ncclUniqueId id;
|
||||
if (rank_ == kRootRank) {
|
||||
if (communicator_->GetRank() == kRootRank) {
|
||||
dh::safe_nccl(ncclGetUniqueId(&id));
|
||||
}
|
||||
Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
|
||||
communicator_->Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId),
|
||||
static_cast<int>(kRootRank));
|
||||
return id;
|
||||
}
|
||||
|
||||
void BitwiseAllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
|
||||
Operation op);
|
||||
static ncclDataType_t GetNcclDataType(DataType const &data_type) {
|
||||
ncclDataType_t result;
|
||||
switch (data_type) {
|
||||
case DataType::kInt8:
|
||||
result = ncclInt8;
|
||||
break;
|
||||
case DataType::kUInt8:
|
||||
result = ncclUint8;
|
||||
break;
|
||||
case DataType::kInt32:
|
||||
result = ncclInt32;
|
||||
break;
|
||||
case DataType::kUInt32:
|
||||
result = ncclUint32;
|
||||
break;
|
||||
case DataType::kInt64:
|
||||
result = ncclInt64;
|
||||
break;
|
||||
case DataType::kUInt64:
|
||||
result = ncclUint64;
|
||||
break;
|
||||
case DataType::kFloat:
|
||||
result = ncclFloat;
|
||||
break;
|
||||
case DataType::kDouble:
|
||||
result = ncclDouble;
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unknown data type.";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static ncclRedOp_t GetNcclRedOp(Operation const &op) {
|
||||
ncclRedOp_t result;
|
||||
switch (op) {
|
||||
case Operation::kMax:
|
||||
result = ncclMax;
|
||||
break;
|
||||
case Operation::kMin:
|
||||
result = ncclMin;
|
||||
break;
|
||||
case Operation::kSum:
|
||||
result = ncclSum;
|
||||
break;
|
||||
case Operation::kBitwiseAND:
|
||||
case Operation::kBitwiseOR:
|
||||
case Operation::kBitwiseXOR:
|
||||
LOG(FATAL) << "Not implemented yet.";
|
||||
default:
|
||||
LOG(FATAL) << "Unknown reduce operation.";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int const device_ordinal_;
|
||||
bool const needs_sync_;
|
||||
int const world_size_;
|
||||
int const rank_;
|
||||
Communicator *communicator_;
|
||||
ncclComm_t nccl_comm_{};
|
||||
cudaStream_t cuda_stream_{};
|
||||
ncclUniqueId nccl_unique_id_{};
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2019 by Contributors
|
||||
* \file bitfield.h
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_BITFIELD_H_
|
||||
@@ -50,17 +50,14 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr
|
||||
}
|
||||
#endif // defined(__CUDACC__)
|
||||
|
||||
/**
|
||||
* @brief A non-owning type with auxiliary methods defined for manipulating bits.
|
||||
/*!
|
||||
* \brief A non-owning type with auxiliary methods defined for manipulating bits.
|
||||
*
|
||||
* @tparam VT Underlying value type, must be an unsigned integer.
|
||||
* @tparam Direction Whether the bits start from left or from right.
|
||||
* @tparam IsConst Whether the view is const.
|
||||
* \tparam Direction Whether the bits start from left or from right.
|
||||
*/
|
||||
template <typename VT, typename Direction, bool IsConst = false>
|
||||
struct BitFieldContainer {
|
||||
using value_type = std::conditional_t<IsConst, VT const, VT>; // NOLINT
|
||||
using size_type = size_t; // NOLINT
|
||||
using index_type = size_t; // NOLINT
|
||||
using pointer = value_type*; // NOLINT
|
||||
|
||||
@@ -73,9 +70,8 @@ struct BitFieldContainer {
|
||||
};
|
||||
|
||||
private:
|
||||
value_type* bits_{nullptr};
|
||||
size_type n_values_{0};
|
||||
static_assert(!std::is_signed<VT>::value, "Must use an unsiged type as the underlying storage.");
|
||||
common::Span<value_type> bits_;
|
||||
static_assert(!std::is_signed<VT>::value, "Must use unsiged type as underlying storage.");
|
||||
|
||||
public:
|
||||
XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
|
||||
@@ -90,15 +86,13 @@ struct BitFieldContainer {
|
||||
|
||||
public:
|
||||
BitFieldContainer() = default;
|
||||
XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits)
|
||||
: bits_{bits.data()}, n_values_{bits.size()} {}
|
||||
BitFieldContainer(BitFieldContainer const& other) = default;
|
||||
BitFieldContainer(BitFieldContainer&& other) = default;
|
||||
XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
|
||||
XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}
|
||||
BitFieldContainer &operator=(BitFieldContainer const &that) = default;
|
||||
BitFieldContainer &operator=(BitFieldContainer &&that) = default;
|
||||
|
||||
XGBOOST_DEVICE auto Bits() { return common::Span<value_type>{bits_, NumValues()}; }
|
||||
XGBOOST_DEVICE auto Bits() const { return common::Span<value_type const>{bits_, NumValues()}; }
|
||||
XGBOOST_DEVICE common::Span<value_type> Bits() { return bits_; }
|
||||
XGBOOST_DEVICE common::Span<value_type const> Bits() const { return bits_; }
|
||||
|
||||
/*\brief Compute the size of needed memory allocation. The returned value is in terms
|
||||
* of number of elements with `BitFieldContainer::value_type'.
|
||||
@@ -109,17 +103,17 @@ struct BitFieldContainer {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
__device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
|
||||
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t min_size = min(NumValues(), rhs.NumValues());
|
||||
size_t min_size = min(bits_.size(), rhs.bits_.size());
|
||||
if (tid < min_size) {
|
||||
Data()[tid] |= rhs.Data()[tid];
|
||||
bits_[tid] |= rhs.bits_[tid];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#else
|
||||
BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
|
||||
size_t min_size = std::min(NumValues(), rhs.NumValues());
|
||||
size_t min_size = std::min(bits_.size(), rhs.bits_.size());
|
||||
for (size_t i = 0; i < min_size; ++i) {
|
||||
Data()[i] |= rhs.Data()[i];
|
||||
bits_[i] |= rhs.bits_[i];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
@@ -127,85 +121,75 @@ struct BitFieldContainer {
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
__device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
|
||||
size_t min_size = min(NumValues(), rhs.NumValues());
|
||||
size_t min_size = min(bits_.size(), rhs.bits_.size());
|
||||
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < min_size) {
|
||||
Data()[tid] &= rhs.Data()[tid];
|
||||
bits_[tid] &= rhs.bits_[tid];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#else
|
||||
BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
|
||||
size_t min_size = std::min(NumValues(), rhs.NumValues());
|
||||
size_t min_size = std::min(bits_.size(), rhs.bits_.size());
|
||||
for (size_t i = 0; i < min_size; ++i) {
|
||||
Data()[i] &= rhs.Data()[i];
|
||||
bits_[i] &= rhs.bits_[i];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif // defined(__CUDA_ARCH__)
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
__device__ auto Set(index_type pos) noexcept(true) {
|
||||
__device__ auto Set(index_type pos) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
value_type& value = Data()[pos_v.int_pos];
|
||||
value_type& value = bits_[pos_v.int_pos];
|
||||
value_type set_bit = kOne << pos_v.bit_pos;
|
||||
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
|
||||
atomicOr(reinterpret_cast<Type *>(&value), set_bit);
|
||||
}
|
||||
__device__ void Clear(index_type pos) noexcept(true) {
|
||||
__device__ void Clear(index_type pos) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
value_type& value = Data()[pos_v.int_pos];
|
||||
value_type& value = bits_[pos_v.int_pos];
|
||||
value_type clear_bit = ~(kOne << pos_v.bit_pos);
|
||||
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
|
||||
atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
|
||||
}
|
||||
#else
|
||||
void Set(index_type pos) noexcept(true) {
|
||||
void Set(index_type pos) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
value_type& value = Data()[pos_v.int_pos];
|
||||
value_type& value = bits_[pos_v.int_pos];
|
||||
value_type set_bit = kOne << pos_v.bit_pos;
|
||||
value |= set_bit;
|
||||
}
|
||||
void Clear(index_type pos) noexcept(true) {
|
||||
void Clear(index_type pos) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
value_type& value = Data()[pos_v.int_pos];
|
||||
value_type& value = bits_[pos_v.int_pos];
|
||||
value_type clear_bit = ~(kOne << pos_v.bit_pos);
|
||||
value &= clear_bit;
|
||||
}
|
||||
#endif // defined(__CUDA_ARCH__)
|
||||
|
||||
XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) {
|
||||
XGBOOST_DEVICE bool Check(Pos pos_v) const {
|
||||
pos_v = Direction::Shift(pos_v);
|
||||
assert(pos_v.int_pos < NumValues());
|
||||
value_type const value = Data()[pos_v.int_pos];
|
||||
SPAN_LT(pos_v.int_pos, bits_.size());
|
||||
value_type const value = bits_[pos_v.int_pos];
|
||||
value_type const test_bit = kOne << pos_v.bit_pos;
|
||||
value_type result = test_bit & value;
|
||||
return static_cast<bool>(result);
|
||||
}
|
||||
[[nodiscard]] XGBOOST_DEVICE bool Check(index_type pos) const noexcept(true) {
|
||||
XGBOOST_DEVICE bool Check(index_type pos) const {
|
||||
Pos pos_v = ToBitPos(pos);
|
||||
return Check(pos_v);
|
||||
}
|
||||
/**
|
||||
* @brief Returns the total number of bits that can be viewed. This is equal to or
|
||||
* larger than the acutal number of valid bits.
|
||||
*/
|
||||
[[nodiscard]] XGBOOST_DEVICE size_type Capacity() const noexcept(true) {
|
||||
return kValueSize * NumValues();
|
||||
}
|
||||
/**
|
||||
* @brief Number of storage unit used in this bit field.
|
||||
*/
|
||||
[[nodiscard]] XGBOOST_DEVICE size_type NumValues() const noexcept(true) { return n_values_; }
|
||||
|
||||
XGBOOST_DEVICE pointer Data() const noexcept(true) { return bits_; }
|
||||
XGBOOST_DEVICE size_t Size() const { return kValueSize * bits_.size(); }
|
||||
|
||||
inline friend std::ostream& operator<<(std::ostream& os,
|
||||
BitFieldContainer<VT, Direction, IsConst> field) {
|
||||
os << "Bits "
|
||||
<< "storage size: " << field.NumValues() << "\n";
|
||||
for (typename common::Span<value_type>::index_type i = 0; i < field.NumValues(); ++i) {
|
||||
std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.Data()[i]);
|
||||
XGBOOST_DEVICE pointer Data() const { return bits_.data(); }
|
||||
|
||||
inline friend std::ostream &
|
||||
operator<<(std::ostream &os, BitFieldContainer<VT, Direction, IsConst> field) {
|
||||
os << "Bits " << "storage size: " << field.bits_.size() << "\n";
|
||||
for (typename common::Span<value_type>::index_type i = 0; i < field.bits_.size(); ++i) {
|
||||
std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.bits_[i]);
|
||||
os << bset << "\n";
|
||||
}
|
||||
return os;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2020-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2020-2022 by XGBoost Contributors
|
||||
* \file categorical.h
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "bitfield.h"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/parameter.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@@ -1,27 +1,16 @@
|
||||
/**
|
||||
* Copyright 2017-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2017-2022 by XGBoost Contributors
|
||||
* \brief Utility for fast column-wise access
|
||||
*/
|
||||
#include "column_matrix.h"
|
||||
|
||||
#include <algorithm> // for transform
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for uint64_t, uint8_t
|
||||
#include <limits> // for numeric_limits
|
||||
#include <type_traits> // for remove_reference_t
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../data/gradient_index.h" // for GHistIndexMatrix
|
||||
#include "io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "xgboost/base.h" // for bst_feaature_t
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::common {
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold) {
|
||||
auto const nfeature = gmat.Features();
|
||||
const size_t nrow = gmat.Size();
|
||||
// identify type of each column
|
||||
type_ = common::MakeFixedVecWithMalloc(nfeature, ColumnType{});
|
||||
type_.resize(nfeature);
|
||||
|
||||
uint32_t max_val = std::numeric_limits<uint32_t>::max();
|
||||
for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
|
||||
@@ -45,7 +34,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
|
||||
|
||||
// want to compute storage boundary for each feature
|
||||
// using variants of prefix sum scan
|
||||
feature_offsets_ = common::MakeFixedVecWithMalloc(nfeature + 1, std::size_t{0});
|
||||
feature_offsets_.resize(nfeature + 1);
|
||||
size_t accum_index = 0;
|
||||
feature_offsets_[0] = accum_index;
|
||||
for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
|
||||
@@ -60,11 +49,9 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
|
||||
SetTypeSize(gmat.MaxNumBinPerFeat());
|
||||
auto storage_size =
|
||||
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
|
||||
|
||||
index_ = common::MakeFixedVecWithMalloc(storage_size, std::uint8_t{0});
|
||||
|
||||
index_.resize(storage_size, 0);
|
||||
if (!all_dense_column) {
|
||||
row_ind_ = common::MakeFixedVecWithMalloc(feature_offsets_[nfeature], std::size_t{0});
|
||||
row_ind_.resize(feature_offsets_[nfeature]);
|
||||
}
|
||||
|
||||
// store least bin id for each feature
|
||||
@@ -72,51 +59,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
|
||||
|
||||
any_missing_ = !gmat.IsDense();
|
||||
|
||||
missing_ = MissingIndicator{0, false};
|
||||
missing_flags_.clear();
|
||||
}
|
||||
|
||||
// IO procedures for external memory.
|
||||
bool ColumnMatrix::Read(AlignedResourceReadStream* fi, uint32_t const* index_base) {
|
||||
if (!common::ReadVec(fi, &index_)) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &type_)) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &row_ind_)) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &feature_offsets_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common::ReadVec(fi, &missing_.storage)) {
|
||||
return false;
|
||||
}
|
||||
missing_.InitView();
|
||||
|
||||
index_base_ = index_base;
|
||||
if (!fi->Read(&bins_type_size_)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&any_missing_)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::size_t ColumnMatrix::Write(AlignedFileWriteStream* fo) const {
|
||||
std::size_t bytes{0};
|
||||
|
||||
bytes += common::WriteVec(fo, index_);
|
||||
bytes += common::WriteVec(fo, type_);
|
||||
bytes += common::WriteVec(fo, row_ind_);
|
||||
bytes += common::WriteVec(fo, feature_offsets_);
|
||||
bytes += common::WriteVec(fo, missing_.storage);
|
||||
|
||||
bytes += fo->Write(bins_type_size_);
|
||||
bytes += fo->Write(any_missing_);
|
||||
|
||||
return bytes;
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2017-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2017-2022 by Contributors
|
||||
* \file column_matrix.h
|
||||
* \brief Utility for fast column-wise access
|
||||
* \author Philip Cho
|
||||
@@ -8,30 +8,25 @@
|
||||
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
|
||||
#include <dmlc/endian.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for uint8_t
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility> // for move
|
||||
#include <utility> // std::move
|
||||
#include <vector>
|
||||
|
||||
#include "../data/adapter.h"
|
||||
#include "../data/gradient_index.h"
|
||||
#include "algorithm.h"
|
||||
#include "bitfield.h" // for RBitField8
|
||||
#include "hist_util.h"
|
||||
#include "ref_resource_view.h" // for RefResourceView
|
||||
#include "xgboost/base.h" // for bst_bin_t
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::common {
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
class ColumnMatrix;
|
||||
class AlignedFileWriteStream;
|
||||
class AlignedResourceReadStream;
|
||||
|
||||
/*! \brief column type */
|
||||
enum ColumnType : std::uint8_t { kDenseColumn, kSparseColumn };
|
||||
enum ColumnType : uint8_t { kDenseColumn, kSparseColumn };
|
||||
|
||||
/*! \brief a column storage, to be used with ApplySplit. Note that each
|
||||
bin id is stored as index[i] + index_base.
|
||||
@@ -46,12 +41,12 @@ class Column {
|
||||
: index_(index), index_base_(least_bin_idx) {}
|
||||
virtual ~Column() = default;
|
||||
|
||||
[[nodiscard]] bst_bin_t GetGlobalBinIdx(size_t idx) const {
|
||||
bst_bin_t GetGlobalBinIdx(size_t idx) const {
|
||||
return index_base_ + static_cast<bst_bin_t>(index_[idx]);
|
||||
}
|
||||
|
||||
/* returns number of elements in column */
|
||||
[[nodiscard]] size_t Size() const { return index_.size(); }
|
||||
size_t Size() const { return index_.size(); }
|
||||
|
||||
private:
|
||||
/* bin indexes in range [0, max_bins - 1] */
|
||||
@@ -68,7 +63,7 @@ class SparseColumnIter : public Column<BinIdxT> {
|
||||
common::Span<const size_t> row_ind_;
|
||||
size_t idx_;
|
||||
|
||||
[[nodiscard]] size_t const* RowIndices() const { return row_ind_.data(); }
|
||||
size_t const* RowIndices() const { return row_ind_.data(); }
|
||||
|
||||
public:
|
||||
SparseColumnIter(common::Span<const BinIdxT> index, bst_bin_t least_bin_idx,
|
||||
@@ -86,7 +81,7 @@ class SparseColumnIter : public Column<BinIdxT> {
|
||||
SparseColumnIter(SparseColumnIter const&) = delete;
|
||||
SparseColumnIter(SparseColumnIter&&) = default;
|
||||
|
||||
[[nodiscard]] size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
|
||||
size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
|
||||
bst_bin_t operator[](size_t rid) {
|
||||
const size_t column_size = this->Size();
|
||||
if (!((idx_) < column_size)) {
|
||||
@@ -106,10 +101,6 @@ class SparseColumnIter : public Column<BinIdxT> {
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Column stored as a dense vector. It might still contain missing values as
|
||||
* indicated by the missing flags.
|
||||
*/
|
||||
template <typename BinIdxT, bool any_missing>
|
||||
class DenseColumnIter : public Column<BinIdxT> {
|
||||
public:
|
||||
@@ -118,19 +109,17 @@ class DenseColumnIter : public Column<BinIdxT> {
|
||||
private:
|
||||
using Base = Column<BinIdxT>;
|
||||
/* flags for missing values in dense columns */
|
||||
LBitField32 missing_flags_;
|
||||
std::vector<ByteType> const& missing_flags_;
|
||||
size_t feature_offset_;
|
||||
|
||||
public:
|
||||
explicit DenseColumnIter(common::Span<const BinIdxT> index, bst_bin_t index_base,
|
||||
LBitField32 missing_flags, size_t feature_offset)
|
||||
std::vector<ByteType> const& missing_flags, size_t feature_offset)
|
||||
: Base{index, index_base}, missing_flags_{missing_flags}, feature_offset_{feature_offset} {}
|
||||
DenseColumnIter(DenseColumnIter const&) = delete;
|
||||
DenseColumnIter(DenseColumnIter&&) = default;
|
||||
|
||||
[[nodiscard]] bool IsMissing(size_t ridx) const {
|
||||
return missing_flags_.Check(feature_offset_ + ridx);
|
||||
}
|
||||
bool IsMissing(size_t ridx) const { return missing_flags_[feature_offset_ + ridx]; }
|
||||
|
||||
bst_bin_t operator[](size_t ridx) const {
|
||||
if (any_missing) {
|
||||
@@ -142,54 +131,12 @@ class DenseColumnIter : public Column<BinIdxT> {
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Column major matrix for gradient index on CPU.
|
||||
*
|
||||
* This matrix contains both dense columns and sparse columns, the type of the column
|
||||
* is controlled by the sparse threshold parameter. When the number of missing values
|
||||
* in a column is below the threshold it's classified as dense column.
|
||||
* \brief Column major matrix for gradient index. This matrix contains both dense column
|
||||
* and sparse column, the type of the column is controlled by sparse threshold. When the
|
||||
* number of missing values in a column is below the threshold it's classified as dense
|
||||
* column.
|
||||
*/
|
||||
class ColumnMatrix {
|
||||
/**
|
||||
* @brief A bit set for indicating whether an element in a dense column is missing.
|
||||
*/
|
||||
struct MissingIndicator {
|
||||
LBitField32 missing;
|
||||
RefResourceView<std::uint32_t> storage;
|
||||
|
||||
MissingIndicator() = default;
|
||||
/**
|
||||
* @param n_elements Size of the bit set
|
||||
* @param init Initialize the indicator to true or false.
|
||||
*/
|
||||
MissingIndicator(std::size_t n_elements, bool init) {
|
||||
auto m_size = missing.ComputeStorageSize(n_elements);
|
||||
storage = common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0});
|
||||
this->InitView();
|
||||
}
|
||||
/** @brief Set the i^th element to be a valid element (instead of missing). */
|
||||
void SetValid(typename LBitField32::index_type i) { missing.Clear(i); }
|
||||
/** @brief assign the storage to the view. */
|
||||
void InitView() {
|
||||
missing = LBitField32{Span{storage.data(), storage.size()}};
|
||||
}
|
||||
|
||||
void GrowTo(std::size_t n_elements, bool init) {
|
||||
CHECK(storage.Resource()->Type() == ResourceHandler::kMalloc)
|
||||
<< "[Internal Error]: Cannot grow the vector when external memory is used.";
|
||||
auto m_size = missing.ComputeStorageSize(n_elements);
|
||||
CHECK_GE(m_size, storage.size());
|
||||
if (m_size == storage.size()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto new_storage =
|
||||
common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0});
|
||||
std::copy_n(storage.cbegin(), storage.size(), new_storage.begin());
|
||||
storage = std::move(new_storage);
|
||||
this->InitView();
|
||||
}
|
||||
};
|
||||
|
||||
void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold);
|
||||
|
||||
template <typename ColumnBinT, typename BinT, typename RIdx>
|
||||
@@ -197,10 +144,9 @@ class ColumnMatrix {
|
||||
if (type_[fid] == kDenseColumn) {
|
||||
ColumnBinT* begin = &local_index[feature_offsets_[fid]];
|
||||
begin[rid] = bin_id - index_base_[fid];
|
||||
// not thread-safe with bit field.
|
||||
// FIXME(jiamingy): We can directly assign kMissingId to the index to avoid missing
|
||||
// flags.
|
||||
missing_.SetValid(feature_offsets_[fid] + rid);
|
||||
// not thread-safe with bool vector. FIXME(jiamingy): We can directly assign
|
||||
// kMissingId to the index to avoid missing flags.
|
||||
missing_flags_[feature_offsets_[fid] + rid] = false;
|
||||
} else {
|
||||
ColumnBinT* begin = &local_index[feature_offsets_[fid]];
|
||||
begin[num_nonzeros_[fid]] = bin_id - index_base_[fid];
|
||||
@@ -212,9 +158,7 @@ class ColumnMatrix {
|
||||
public:
|
||||
using ByteType = bool;
|
||||
// get number of features
|
||||
[[nodiscard]] bst_feature_t GetNumFeature() const {
|
||||
return static_cast<bst_feature_t>(type_.size());
|
||||
}
|
||||
bst_feature_t GetNumFeature() const { return static_cast<bst_feature_t>(type_.size()); }
|
||||
|
||||
ColumnMatrix() = default;
|
||||
ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold) {
|
||||
@@ -222,7 +166,7 @@ class ColumnMatrix {
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
|
||||
* \brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
|
||||
* SparsePage.
|
||||
*/
|
||||
void InitFromSparse(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
|
||||
@@ -234,8 +178,8 @@ class ColumnMatrix {
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
|
||||
* data.
|
||||
* \brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
|
||||
* data.
|
||||
*
|
||||
* This function requires a binary search for each bin to get back the feature index
|
||||
* for those bins.
|
||||
@@ -255,7 +199,7 @@ class ColumnMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] bool IsInitialized() const { return !type_.empty(); }
|
||||
bool IsInitialized() const { return !type_.empty(); }
|
||||
|
||||
/**
|
||||
* \brief Push batch of data for Quantile DMatrix support.
|
||||
@@ -313,7 +257,7 @@ class ColumnMatrix {
|
||||
reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
|
||||
column_size};
|
||||
return std::move(DenseColumnIter<BinIdxType, any_missing>{
|
||||
bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset});
|
||||
bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_flags_, feature_offset});
|
||||
}
|
||||
|
||||
// all columns are dense column and has no missing value
|
||||
@@ -321,8 +265,7 @@ class ColumnMatrix {
|
||||
template <typename RowBinIdxT>
|
||||
void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
|
||||
const size_t n_features, int32_t n_threads) {
|
||||
missing_.GrowTo(feature_offsets_[n_features], false);
|
||||
|
||||
missing_flags_.resize(feature_offsets_[n_features], false);
|
||||
DispatchBinType(bins_type_size_, [&](auto t) {
|
||||
using ColumnBinT = decltype(t);
|
||||
auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
|
||||
@@ -347,15 +290,9 @@ class ColumnMatrix {
|
||||
void SetIndexMixedColumns(size_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat,
|
||||
float missing) {
|
||||
auto n_features = gmat.Features();
|
||||
|
||||
missing_.GrowTo(feature_offsets_[n_features], true);
|
||||
auto const* row_index = gmat.index.data<std::uint32_t>() + gmat.row_ptr[base_rowid];
|
||||
if (num_nonzeros_.empty()) {
|
||||
num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
|
||||
} else {
|
||||
CHECK_EQ(num_nonzeros_.size(), n_features);
|
||||
}
|
||||
|
||||
missing_flags_.resize(feature_offsets_[n_features], true);
|
||||
auto const* row_index = gmat.index.data<uint32_t>() + gmat.row_ptr[base_rowid];
|
||||
num_nonzeros_.resize(n_features, 0);
|
||||
auto is_valid = data::IsValidFunctor{missing};
|
||||
|
||||
DispatchBinType(bins_type_size_, [&](auto t) {
|
||||
@@ -384,9 +321,8 @@ class ColumnMatrix {
|
||||
*/
|
||||
void SetIndexMixedColumns(const GHistIndexMatrix& gmat) {
|
||||
auto n_features = gmat.Features();
|
||||
|
||||
missing_ = MissingIndicator{feature_offsets_[n_features], true};
|
||||
num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
|
||||
missing_flags_.resize(feature_offsets_[n_features], true);
|
||||
num_nonzeros_.resize(n_features, 0);
|
||||
|
||||
DispatchBinType(bins_type_size_, [&](auto t) {
|
||||
using ColumnBinT = decltype(t);
|
||||
@@ -399,34 +335,106 @@ class ColumnMatrix {
|
||||
});
|
||||
}
|
||||
|
||||
[[nodiscard]] BinTypeSize GetTypeSize() const { return bins_type_size_; }
|
||||
[[nodiscard]] auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }
|
||||
BinTypeSize GetTypeSize() const { return bins_type_size_; }
|
||||
auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }
|
||||
|
||||
// And this returns part of state
|
||||
[[nodiscard]] bool AnyMissing() const { return any_missing_; }
|
||||
bool AnyMissing() const { return any_missing_; }
|
||||
|
||||
// IO procedures for external memory.
|
||||
[[nodiscard]] bool Read(AlignedResourceReadStream* fi, uint32_t const* index_base);
|
||||
[[nodiscard]] std::size_t Write(AlignedFileWriteStream* fo) const;
|
||||
bool Read(dmlc::SeekStream* fi, uint32_t const* index_base) {
|
||||
fi->Read(&index_);
|
||||
#if !DMLC_LITTLE_ENDIAN
|
||||
// s390x
|
||||
std::vector<std::underlying_type<ColumnType>::type> int_types;
|
||||
fi->Read(&int_types);
|
||||
type_.resize(int_types.size());
|
||||
std::transform(
|
||||
int_types.begin(), int_types.end(), type_.begin(),
|
||||
[](std::underlying_type<ColumnType>::type i) { return static_cast<ColumnType>(i); });
|
||||
#else
|
||||
fi->Read(&type_);
|
||||
#endif // !DMLC_LITTLE_ENDIAN
|
||||
|
||||
fi->Read(&row_ind_);
|
||||
fi->Read(&feature_offsets_);
|
||||
|
||||
std::vector<std::uint8_t> missing;
|
||||
fi->Read(&missing);
|
||||
missing_flags_.resize(missing.size());
|
||||
std::transform(missing.cbegin(), missing.cend(), missing_flags_.begin(),
|
||||
[](std::uint8_t flag) { return !!flag; });
|
||||
|
||||
index_base_ = index_base;
|
||||
#if !DMLC_LITTLE_ENDIAN
|
||||
std::underlying_type<BinTypeSize>::type v;
|
||||
fi->Read(&v);
|
||||
bins_type_size_ = static_cast<BinTypeSize>(v);
|
||||
#else
|
||||
fi->Read(&bins_type_size_);
|
||||
#endif
|
||||
|
||||
fi->Read(&any_missing_);
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t Write(dmlc::Stream* fo) const {
|
||||
size_t bytes{0};
|
||||
|
||||
auto write_vec = [&](auto const& vec) {
|
||||
fo->Write(vec);
|
||||
bytes += vec.size() * sizeof(typename std::remove_reference_t<decltype(vec)>::value_type) +
|
||||
sizeof(uint64_t);
|
||||
};
|
||||
write_vec(index_);
|
||||
#if !DMLC_LITTLE_ENDIAN
|
||||
// s390x
|
||||
std::vector<std::underlying_type<ColumnType>::type> int_types(type_.size());
|
||||
std::transform(type_.begin(), type_.end(), int_types.begin(), [](ColumnType t) {
|
||||
return static_cast<std::underlying_type<ColumnType>::type>(t);
|
||||
});
|
||||
write_vec(int_types);
|
||||
#else
|
||||
write_vec(type_);
|
||||
#endif // !DMLC_LITTLE_ENDIAN
|
||||
write_vec(row_ind_);
|
||||
write_vec(feature_offsets_);
|
||||
// dmlc can not handle bool vector
|
||||
std::vector<std::uint8_t> missing(missing_flags_.size());
|
||||
std::transform(missing_flags_.cbegin(), missing_flags_.cend(), missing.begin(),
|
||||
[](bool flag) { return static_cast<std::uint8_t>(flag); });
|
||||
write_vec(missing);
|
||||
|
||||
#if !DMLC_LITTLE_ENDIAN
|
||||
auto v = static_cast<std::underlying_type<BinTypeSize>::type>(bins_type_size_);
|
||||
fo->Write(v);
|
||||
#else
|
||||
fo->Write(bins_type_size_);
|
||||
#endif // DMLC_LITTLE_ENDIAN
|
||||
bytes += sizeof(bins_type_size_);
|
||||
fo->Write(any_missing_);
|
||||
bytes += sizeof(any_missing_);
|
||||
|
||||
return bytes;
|
||||
}
|
||||
|
||||
private:
|
||||
RefResourceView<std::uint8_t> index_;
|
||||
std::vector<uint8_t> index_;
|
||||
|
||||
RefResourceView<ColumnType> type_;
|
||||
/** @brief indptr of a CSC matrix. */
|
||||
RefResourceView<std::size_t> row_ind_;
|
||||
/** @brief indicate where each column's index and row_ind is stored. */
|
||||
RefResourceView<std::size_t> feature_offsets_;
|
||||
/** @brief The number of nnz of each column. */
|
||||
RefResourceView<std::size_t> num_nonzeros_;
|
||||
std::vector<ColumnType> type_;
|
||||
/* indptr of a CSC matrix. */
|
||||
std::vector<size_t> row_ind_;
|
||||
/* indicate where each column's index and row_ind is stored. */
|
||||
std::vector<size_t> feature_offsets_;
|
||||
/* The number of nnz of each column. */
|
||||
std::vector<size_t> num_nonzeros_;
|
||||
|
||||
// index_base_[fid]: least bin id for feature fid
|
||||
std::uint32_t const* index_base_;
|
||||
|
||||
MissingIndicator missing_;
|
||||
|
||||
uint32_t const* index_base_;
|
||||
std::vector<ByteType> missing_flags_;
|
||||
BinTypeSize bins_type_size_;
|
||||
bool any_missing_;
|
||||
};
|
||||
} // namespace xgboost::common
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
|
||||
@@ -6,11 +6,6 @@
|
||||
#ifndef XGBOOST_COMMON_ERROR_MSG_H_
|
||||
#define XGBOOST_COMMON_ERROR_MSG_H_
|
||||
|
||||
#include <cinttypes> // for uint64_t
|
||||
#include <limits> // for numeric_limits
|
||||
|
||||
#include "xgboost/base.h" // for bst_feature_t
|
||||
#include "xgboost/logging.h"
|
||||
#include "xgboost/string_view.h" // for StringView
|
||||
|
||||
namespace xgboost::error {
|
||||
@@ -38,46 +33,5 @@ constexpr StringView InconsistentMaxBin() {
|
||||
return "Inconsistent `max_bin`. `max_bin` should be the same across different QuantileDMatrix, "
|
||||
"and consistent with the Booster being trained.";
|
||||
}
|
||||
|
||||
constexpr StringView UnknownDevice() { return "Unknown device type."; }
|
||||
|
||||
inline void MaxFeatureSize(std::uint64_t n_features) {
|
||||
auto max_n_features = std::numeric_limits<bst_feature_t>::max();
|
||||
CHECK_LE(n_features, max_n_features)
|
||||
<< "Unfortunately, XGBoost does not support data matrices with "
|
||||
<< std::numeric_limits<bst_feature_t>::max() << " features or greater";
|
||||
}
|
||||
|
||||
constexpr StringView InplacePredictProxy() {
|
||||
return "Inplace predict accepts only DMatrixProxy as input.";
|
||||
}
|
||||
|
||||
inline void MaxSampleSize(std::size_t n) {
|
||||
LOG(FATAL) << "Sample size too large for the current updater. Maximum number of samples:" << n
|
||||
<< ". Consider using a different updater or tree_method.";
|
||||
}
|
||||
|
||||
constexpr StringView OldSerialization() {
|
||||
return R"doc(If you are loading a serialized model (like pickle in Python, RDS in R) or
|
||||
configuration generated by an older version of XGBoost, please export the model by calling
|
||||
`Booster.save_model` from that version first, then load it back in current version. See:
|
||||
|
||||
https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html
|
||||
|
||||
for more details about differences between saving model and serializing.
|
||||
)doc";
|
||||
}
|
||||
|
||||
inline void WarnOldSerialization() {
|
||||
// Display it once is enough. Otherwise this can be really verbose in distributed
|
||||
// environments.
|
||||
static thread_local bool logged{false};
|
||||
if (logged) {
|
||||
return;
|
||||
}
|
||||
|
||||
LOG(WARNING) << OldSerialization();
|
||||
logged = true;
|
||||
}
|
||||
} // namespace xgboost::error
|
||||
#endif // XGBOOST_COMMON_ERROR_MSG_H_
|
||||
|
||||
@@ -127,76 +127,55 @@ void SortByWeight(dh::device_vector<float>* weights,
|
||||
});
|
||||
}
|
||||
|
||||
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry>* p_sorted_entries,
|
||||
dh::device_vector<float>* p_sorted_weights,
|
||||
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
|
||||
void RemoveDuplicatedCategories(
|
||||
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry> *p_sorted_entries,
|
||||
dh::caching_device_vector<size_t> *p_column_sizes_scan) {
|
||||
info.feature_types.SetDevice(device);
|
||||
auto d_feature_types = info.feature_types.ConstDeviceSpan();
|
||||
CHECK(!d_feature_types.empty());
|
||||
auto& column_sizes_scan = *p_column_sizes_scan;
|
||||
auto& sorted_entries = *p_sorted_entries;
|
||||
auto &column_sizes_scan = *p_column_sizes_scan;
|
||||
auto &sorted_entries = *p_sorted_entries;
|
||||
// Removing duplicated entries in categorical features.
|
||||
|
||||
// We don't need to accumulate weight for duplicated entries as there's no weighted
|
||||
// sketching for categorical features, the categories are the cut values.
|
||||
dh::caching_device_vector<size_t> new_column_scan(column_sizes_scan.size());
|
||||
std::size_t n_uniques{0};
|
||||
if (p_sorted_weights) {
|
||||
using Pair = thrust::tuple<Entry, float>;
|
||||
auto d_sorted_entries = dh::ToSpan(sorted_entries);
|
||||
auto d_sorted_weights = dh::ToSpan(*p_sorted_weights);
|
||||
auto val_in_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
|
||||
auto val_out_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
|
||||
n_uniques = dh::SegmentedUnique(
|
||||
column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
|
||||
val_in_it, val_in_it + sorted_entries.size(), new_column_scan.data().get(), val_out_it,
|
||||
[=] __device__(Pair const& l, Pair const& r) {
|
||||
Entry const& le = thrust::get<0>(l);
|
||||
Entry const& re = thrust::get<0>(r);
|
||||
if (le.index == re.index && IsCat(d_feature_types, le.index)) {
|
||||
return le.fvalue == re.fvalue;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
p_sorted_weights->resize(n_uniques);
|
||||
} else {
|
||||
n_uniques = dh::SegmentedUnique(
|
||||
column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
|
||||
sorted_entries.begin(), sorted_entries.end(), new_column_scan.data().get(),
|
||||
sorted_entries.begin(), [=] __device__(Entry const& l, Entry const& r) {
|
||||
if (l.index == r.index) {
|
||||
if (IsCat(d_feature_types, l.index)) {
|
||||
return l.fvalue == r.fvalue;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
});
|
||||
}
|
||||
sorted_entries.resize(n_uniques);
|
||||
dh::SegmentedUnique(column_sizes_scan.data().get(),
|
||||
column_sizes_scan.data().get() + column_sizes_scan.size(),
|
||||
sorted_entries.begin(), sorted_entries.end(),
|
||||
new_column_scan.data().get(), sorted_entries.begin(),
|
||||
[=] __device__(Entry const &l, Entry const &r) {
|
||||
if (l.index == r.index) {
|
||||
if (IsCat(d_feature_types, l.index)) {
|
||||
return l.fvalue == r.fvalue;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
// Renew the column scan and cut scan based on categorical data.
|
||||
auto d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan);
|
||||
dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(info.num_col_ + 1);
|
||||
dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(
|
||||
info.num_col_ + 1);
|
||||
CHECK_EQ(new_column_scan.size(), new_cuts_size.size());
|
||||
dh::LaunchN(new_column_scan.size(),
|
||||
[=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
|
||||
d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
|
||||
d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
|
||||
d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
|
||||
if (idx == d_new_columns_ptr.size() - 1) {
|
||||
return;
|
||||
}
|
||||
if (IsCat(d_feature_types, idx)) {
|
||||
// Cut size is the same as number of categories in input.
|
||||
d_new_cuts_size[idx] = d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
|
||||
} else {
|
||||
d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
|
||||
}
|
||||
});
|
||||
dh::LaunchN(
|
||||
new_column_scan.size(),
|
||||
[=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
|
||||
d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
|
||||
d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
|
||||
d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
|
||||
if (idx == d_new_columns_ptr.size() - 1) {
|
||||
return;
|
||||
}
|
||||
if (IsCat(d_feature_types, idx)) {
|
||||
// Cut size is the same as number of categories in input.
|
||||
d_new_cuts_size[idx] =
|
||||
d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
|
||||
} else {
|
||||
d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
|
||||
}
|
||||
});
|
||||
// Turn size into ptr.
|
||||
thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(), new_cuts_size.cend(),
|
||||
d_cuts_ptr.data());
|
||||
thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(),
|
||||
new_cuts_size.cend(), d_cuts_ptr.data());
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
@@ -230,8 +209,8 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
|
||||
if (sketch_container->HasCategorical()) {
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
|
||||
&column_sizes_scan);
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||
&sorted_entries, &column_sizes_scan);
|
||||
}
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
@@ -297,8 +276,8 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
|
||||
&column_sizes_scan);
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
if (sketch_container->HasCategorical()) {
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
|
||||
&column_sizes_scan);
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||
&sorted_entries, &column_sizes_scan);
|
||||
}
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
|
||||
@@ -240,10 +240,10 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
|
||||
void SortByWeight(dh::device_vector<float>* weights,
|
||||
dh::device_vector<Entry>* sorted_entries);
|
||||
|
||||
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry>* p_sorted_entries,
|
||||
dh::device_vector<float>* p_sorted_weights,
|
||||
dh::caching_device_vector<size_t>* p_column_sizes_scan);
|
||||
void RemoveDuplicatedCategories(
|
||||
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry> *p_sorted_entries,
|
||||
dh::caching_device_vector<size_t> *p_column_sizes_scan);
|
||||
} // namespace detail
|
||||
|
||||
// Compute sketch on DMatrix.
|
||||
@@ -275,8 +275,8 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
|
||||
|
||||
if (sketch_container->HasCategorical()) {
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
|
||||
&column_sizes_scan);
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||
&sorted_entries, &column_sizes_scan);
|
||||
}
|
||||
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
@@ -354,8 +354,8 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
|
||||
|
||||
if (sketch_container->HasCategorical()) {
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
|
||||
&column_sizes_scan);
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||
&sorted_entries, &column_sizes_scan);
|
||||
}
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
|
||||
@@ -84,7 +84,7 @@ class HistogramCuts {
|
||||
return *this;
|
||||
}
|
||||
|
||||
[[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const {
|
||||
uint32_t FeatureBins(bst_feature_t feature) const {
|
||||
return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
|
||||
}
|
||||
|
||||
@@ -92,8 +92,8 @@ class HistogramCuts {
|
||||
std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
|
||||
std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
|
||||
|
||||
[[nodiscard]] bool HasCategorical() const { return has_categorical_; }
|
||||
[[nodiscard]] float MaxCategory() const { return max_cat_; }
|
||||
bool HasCategorical() const { return has_categorical_; }
|
||||
float MaxCategory() const { return max_cat_; }
|
||||
/**
|
||||
* \brief Set meta info about categorical features.
|
||||
*
|
||||
@@ -105,13 +105,12 @@ class HistogramCuts {
|
||||
max_cat_ = max_cat;
|
||||
}
|
||||
|
||||
[[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
|
||||
size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
|
||||
|
||||
// Return the index of a cut point that is strictly greater than the input
|
||||
// value, or the last available index if none exists
|
||||
[[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id,
|
||||
std::vector<uint32_t> const& ptrs,
|
||||
std::vector<float> const& values) const {
|
||||
bst_bin_t SearchBin(float value, bst_feature_t column_id, std::vector<uint32_t> const& ptrs,
|
||||
std::vector<float> const& values) const {
|
||||
auto end = ptrs[column_id + 1];
|
||||
auto beg = ptrs[column_id];
|
||||
auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
|
||||
@@ -120,20 +119,20 @@ class HistogramCuts {
|
||||
return idx;
|
||||
}
|
||||
|
||||
[[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
|
||||
bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
|
||||
return this->SearchBin(value, column_id, Ptrs(), Values());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Search the bin index for numerical feature.
|
||||
*/
|
||||
[[nodiscard]] bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }
|
||||
bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }
|
||||
|
||||
/**
|
||||
* \brief Search the bin index for categorical feature.
|
||||
*/
|
||||
[[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx,
|
||||
std::vector<uint32_t> const& ptrs,
|
||||
std::vector<float> const& vals) const {
|
||||
bst_bin_t SearchCatBin(float value, bst_feature_t fidx, std::vector<uint32_t> const& ptrs,
|
||||
std::vector<float> const& vals) const {
|
||||
auto end = ptrs.at(fidx + 1) + vals.cbegin();
|
||||
auto beg = ptrs[fidx] + vals.cbegin();
|
||||
// Truncates the value in case it's not perfectly rounded.
|
||||
@@ -144,14 +143,12 @@ class HistogramCuts {
|
||||
}
|
||||
return bin_idx;
|
||||
}
|
||||
[[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
|
||||
bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
|
||||
auto const& ptrs = this->Ptrs();
|
||||
auto const& vals = this->Values();
|
||||
return this->SearchCatBin(value, fidx, ptrs, vals);
|
||||
}
|
||||
[[nodiscard]] bst_bin_t SearchCatBin(Entry const& e) const {
|
||||
return SearchCatBin(e.fvalue, e.index);
|
||||
}
|
||||
bst_bin_t SearchCatBin(Entry const& e) const { return SearchCatBin(e.fvalue, e.index); }
|
||||
|
||||
/**
|
||||
* \brief Return numerical bin value given bin index.
|
||||
@@ -203,33 +200,13 @@ auto DispatchBinType(BinTypeSize type, Fn&& fn) {
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Optionally compressed gradient index. The compression works only with dense
|
||||
* \brief Optionally compressed gradient index. The compression works only with dense
|
||||
* data.
|
||||
*
|
||||
* The main body of construction code is in gradient_index.cc, this struct is only a
|
||||
* view class.
|
||||
* storage class.
|
||||
*/
|
||||
class Index {
|
||||
private:
|
||||
void SetBinTypeSize(BinTypeSize binTypeSize) {
|
||||
binTypeSize_ = binTypeSize;
|
||||
switch (binTypeSize) {
|
||||
case kUint8BinsTypeSize:
|
||||
func_ = &GetValueFromUint8;
|
||||
break;
|
||||
case kUint16BinsTypeSize:
|
||||
func_ = &GetValueFromUint16;
|
||||
break;
|
||||
case kUint32BinsTypeSize:
|
||||
func_ = &GetValueFromUint32;
|
||||
break;
|
||||
default:
|
||||
CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
|
||||
binTypeSize == kUint32BinsTypeSize);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
struct Index {
|
||||
// Inside the compressor, bin_idx is the index for cut value across all features. By
|
||||
// subtracting it with starting pointer of each feature, we can reduce it to smaller
|
||||
// value and store it with smaller types. Usable only with dense data.
|
||||
@@ -253,24 +230,10 @@ class Index {
|
||||
}
|
||||
|
||||
Index() { SetBinTypeSize(binTypeSize_); }
|
||||
|
||||
Index(Index const& i) = delete;
|
||||
Index& operator=(Index const& i) = delete;
|
||||
Index(const Index& i) = delete;
|
||||
Index& operator=(Index i) = delete;
|
||||
Index(Index&& i) = delete;
|
||||
|
||||
/** @brief Move assignment for lazy initialization. */
|
||||
Index& operator=(Index&& i) = default;
|
||||
|
||||
/**
|
||||
* @brief Construct the index from data.
|
||||
*
|
||||
* @param data Storage for compressed histogram bin.
|
||||
* @param bin_size Number of bytes for each bin.
|
||||
*/
|
||||
Index(Span<std::uint8_t> data, BinTypeSize bin_size) : data_{data} {
|
||||
this->SetBinTypeSize(bin_size);
|
||||
}
|
||||
|
||||
Index& operator=(Index&& i) = delete;
|
||||
uint32_t operator[](size_t i) const {
|
||||
if (!bin_offset_.empty()) {
|
||||
// dense, compressed
|
||||
@@ -281,7 +244,26 @@ class Index {
|
||||
return func_(data_.data(), i);
|
||||
}
|
||||
}
|
||||
[[nodiscard]] BinTypeSize GetBinTypeSize() const { return binTypeSize_; }
|
||||
void SetBinTypeSize(BinTypeSize binTypeSize) {
|
||||
binTypeSize_ = binTypeSize;
|
||||
switch (binTypeSize) {
|
||||
case kUint8BinsTypeSize:
|
||||
func_ = &GetValueFromUint8;
|
||||
break;
|
||||
case kUint16BinsTypeSize:
|
||||
func_ = &GetValueFromUint16;
|
||||
break;
|
||||
case kUint32BinsTypeSize:
|
||||
func_ = &GetValueFromUint32;
|
||||
break;
|
||||
default:
|
||||
CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
|
||||
binTypeSize == kUint32BinsTypeSize);
|
||||
}
|
||||
}
|
||||
BinTypeSize GetBinTypeSize() const {
|
||||
return binTypeSize_;
|
||||
}
|
||||
template <typename T>
|
||||
T const* data() const { // NOLINT
|
||||
return reinterpret_cast<T const*>(data_.data());
|
||||
@@ -290,27 +272,30 @@ class Index {
|
||||
T* data() { // NOLINT
|
||||
return reinterpret_cast<T*>(data_.data());
|
||||
}
|
||||
[[nodiscard]] std::uint32_t const* Offset() const { return bin_offset_.data(); }
|
||||
[[nodiscard]] std::size_t OffsetSize() const { return bin_offset_.size(); }
|
||||
[[nodiscard]] std::size_t Size() const { return data_.size() / (binTypeSize_); }
|
||||
uint32_t const* Offset() const { return bin_offset_.data(); }
|
||||
size_t OffsetSize() const { return bin_offset_.size(); }
|
||||
size_t Size() const { return data_.size() / (binTypeSize_); }
|
||||
|
||||
void Resize(const size_t n_bytes) {
|
||||
data_.resize(n_bytes);
|
||||
}
|
||||
// set the offset used in compression, cut_ptrs is the CSC indptr in HistogramCuts
|
||||
void SetBinOffset(std::vector<uint32_t> const& cut_ptrs) {
|
||||
bin_offset_.resize(cut_ptrs.size() - 1); // resize to number of features.
|
||||
std::copy_n(cut_ptrs.begin(), bin_offset_.size(), bin_offset_.begin());
|
||||
}
|
||||
auto begin() const { // NOLINT
|
||||
return data_.data();
|
||||
std::vector<uint8_t>::const_iterator begin() const { // NOLINT
|
||||
return data_.begin();
|
||||
}
|
||||
auto end() const { // NOLINT
|
||||
return data_.data() + data_.size();
|
||||
std::vector<uint8_t>::const_iterator end() const { // NOLINT
|
||||
return data_.end();
|
||||
}
|
||||
|
||||
auto begin() { // NOLINT
|
||||
return data_.data();
|
||||
std::vector<uint8_t>::iterator begin() { // NOLINT
|
||||
return data_.begin();
|
||||
}
|
||||
auto end() { // NOLINT
|
||||
return data_.data() + data_.size();
|
||||
std::vector<uint8_t>::iterator end() { // NOLINT
|
||||
return data_.end();
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -325,12 +310,12 @@ class Index {
|
||||
|
||||
using Func = uint32_t (*)(uint8_t const*, size_t);
|
||||
|
||||
Span<std::uint8_t> data_;
|
||||
std::vector<uint8_t> data_;
|
||||
// starting position of each feature inside the cut values (the indptr of the CSC cut matrix
|
||||
// HistogramCuts without the last entry.) Used for bin compression.
|
||||
std::vector<uint32_t> bin_offset_;
|
||||
|
||||
BinTypeSize binTypeSize_{kUint8BinsTypeSize};
|
||||
BinTypeSize binTypeSize_ {kUint8BinsTypeSize};
|
||||
Func func_;
|
||||
};
|
||||
|
||||
|
||||
236
src/common/io.cc
236
src/common/io.cc
@@ -1,47 +1,24 @@
|
||||
/**
|
||||
* Copyright 2019-2023, by XGBoost Contributors
|
||||
/*!
|
||||
* Copyright (c) by XGBoost Contributors 2019-2022
|
||||
*/
|
||||
#if !defined(NOMINMAX) && defined(_WIN32)
|
||||
#define NOMINMAX
|
||||
#endif // !defined(NOMINMAX)
|
||||
|
||||
#if !defined(xgboost_IS_WIN)
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#define xgboost_IS_WIN 1
|
||||
#endif // defined(_MSC_VER) || defined(__MINGW32__)
|
||||
|
||||
#endif // !defined(xgboost_IS_WIN)
|
||||
|
||||
#if defined(__unix__) || defined(__APPLE__)
|
||||
#include <fcntl.h> // for open, O_RDONLY
|
||||
#include <sys/mman.h> // for mmap, mmap64, munmap
|
||||
#include <unistd.h> // for close, getpagesize
|
||||
#elif defined(xgboost_IS_WIN)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#if defined(__unix__)
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#endif // defined(__unix__)
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <cstdio>
|
||||
|
||||
#include <algorithm> // for copy, transform
|
||||
#include <cctype> // for tolower
|
||||
#include <cerrno> // for errno
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t, uint32_t
|
||||
#include <cstring> // for memcpy
|
||||
#include <fstream> // for ifstream
|
||||
#include <iterator> // for distance
|
||||
#include <limits> // for numeric_limits
|
||||
#include <memory> // for unique_ptr
|
||||
#include <string> // for string
|
||||
#include <system_error> // for error_code, system_category
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "io.h"
|
||||
#include "xgboost/collective/socket.h" // for LastError
|
||||
#include "xgboost/logging.h"
|
||||
#include "io.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
namespace xgboost::common {
|
||||
size_t PeekableInStream::Read(void* dptr, size_t size) {
|
||||
size_t nbuffer = buffer_.length() - buffer_ptr_;
|
||||
if (nbuffer == 0) return strm_->Read(dptr, size);
|
||||
@@ -117,32 +94,11 @@ void FixedSizeStream::Take(std::string* out) {
|
||||
*out = std::move(buffer_);
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Get system alignment value for IO with mmap.
|
||||
std::size_t GetMmapAlignment() {
|
||||
#if defined(xgboost_IS_WIN)
|
||||
SYSTEM_INFO sys_info;
|
||||
GetSystemInfo(&sys_info);
|
||||
// During testing, `sys_info.dwPageSize` is of size 4096 while `dwAllocationGranularity` is of
|
||||
// size 65536.
|
||||
return sys_info.dwAllocationGranularity;
|
||||
#else
|
||||
return getpagesize();
|
||||
#endif
|
||||
}
|
||||
|
||||
auto SystemErrorMsg() {
|
||||
std::int32_t errsv = system::LastError();
|
||||
auto err = std::error_code{errsv, std::system_category()};
|
||||
return err.message();
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
std::string LoadSequentialFile(std::string uri, bool stream) {
|
||||
auto OpenErr = [&uri]() {
|
||||
std::string msg;
|
||||
msg = "Opening " + uri + " failed: ";
|
||||
msg += SystemErrorMsg();
|
||||
msg += strerror(errno);
|
||||
LOG(FATAL) << msg;
|
||||
};
|
||||
|
||||
@@ -199,159 +155,5 @@ std::string FileExtension(std::string fname, bool lower) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
|
||||
// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
|
||||
ResourceHandler::~ResourceHandler() noexcept(false) {} // NOLINT
|
||||
|
||||
struct MMAPFile {
|
||||
#if defined(xgboost_IS_WIN)
|
||||
HANDLE fd{INVALID_HANDLE_VALUE};
|
||||
HANDLE file_map{INVALID_HANDLE_VALUE};
|
||||
#else
|
||||
std::int32_t fd{0};
|
||||
#endif
|
||||
std::byte* base_ptr{nullptr};
|
||||
std::size_t base_size{0};
|
||||
std::size_t delta{0};
|
||||
std::string path;
|
||||
|
||||
MMAPFile() = default;
|
||||
|
||||
#if defined(xgboost_IS_WIN)
|
||||
MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
|
||||
std::string path)
|
||||
: fd{fd},
|
||||
file_map{fm},
|
||||
base_ptr{base_ptr},
|
||||
base_size{base_size},
|
||||
delta{delta},
|
||||
path{std::move(path)} {}
|
||||
#else
|
||||
MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
|
||||
std::string path)
|
||||
: fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {}
|
||||
#endif
|
||||
};
|
||||
|
||||
std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t length) {
|
||||
if (length == 0) {
|
||||
return std::make_unique<MMAPFile>();
|
||||
}
|
||||
|
||||
#if defined(xgboost_IS_WIN)
|
||||
HANDLE fd = CreateFile(path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
|
||||
FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED, nullptr);
|
||||
CHECK_NE(fd, INVALID_HANDLE_VALUE) << "Failed to open:" << path << ". " << SystemErrorMsg();
|
||||
#else
|
||||
auto fd = open(path.c_str(), O_RDONLY);
|
||||
CHECK_GE(fd, 0) << "Failed to open:" << path << ". " << SystemErrorMsg();
|
||||
#endif
|
||||
|
||||
std::byte* ptr{nullptr};
|
||||
// Round down for alignment.
|
||||
auto view_start = offset / GetMmapAlignment() * GetMmapAlignment();
|
||||
auto view_size = length + (offset - view_start);
|
||||
|
||||
#if defined(__linux__) || defined(__GLIBC__)
|
||||
int prot{PROT_READ};
|
||||
ptr = reinterpret_cast<std::byte*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
|
||||
madvise(ptr, view_size, MADV_WILLNEED);
|
||||
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
auto handle =
|
||||
std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
|
||||
#elif defined(xgboost_IS_WIN)
|
||||
auto file_size = GetFileSize(fd, nullptr);
|
||||
DWORD access = PAGE_READONLY;
|
||||
auto map_file = CreateFileMapping(fd, nullptr, access, 0, file_size, nullptr);
|
||||
access = FILE_MAP_READ;
|
||||
std::uint32_t loff = static_cast<std::uint32_t>(view_start);
|
||||
std::uint32_t hoff = view_start >> 32;
|
||||
CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
ptr = reinterpret_cast<std::byte*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
|
||||
CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
auto handle = std::make_unique<MMAPFile>(fd, map_file, ptr, view_size, offset - view_start,
|
||||
std::move(path));
|
||||
#else
|
||||
CHECK_LE(offset, std::numeric_limits<off_t>::max())
|
||||
<< "File size has exceeded the limit on the current system.";
|
||||
int prot{PROT_READ};
|
||||
ptr = reinterpret_cast<std::byte*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
|
||||
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
auto handle =
|
||||
std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
|
||||
#endif // defined(__linux__)
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length)
|
||||
: ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {}
|
||||
|
||||
MmapResource::~MmapResource() noexcept(false) {
|
||||
if (!handle_) {
|
||||
return;
|
||||
}
|
||||
#if defined(xgboost_IS_WIN)
|
||||
if (handle_->base_ptr) {
|
||||
CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg();
|
||||
}
|
||||
if (handle_->fd != INVALID_HANDLE_VALUE) {
|
||||
CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg();
|
||||
}
|
||||
if (handle_->file_map != INVALID_HANDLE_VALUE) {
|
||||
CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg();
|
||||
}
|
||||
#else
|
||||
if (handle_->base_ptr) {
|
||||
CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1)
|
||||
<< "Faled to call munmap: " << handle_->path << ". " << SystemErrorMsg();
|
||||
}
|
||||
if (handle_->fd != 0) {
|
||||
CHECK_NE(close(handle_->fd), -1)
|
||||
<< "Faled to close: " << handle_->path << ". " << SystemErrorMsg();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
[[nodiscard]] void* MmapResource::Data() {
|
||||
if (!handle_) {
|
||||
return nullptr;
|
||||
}
|
||||
return handle_->base_ptr + handle_->delta;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::size_t MmapResource::Size() const { return n_; }
|
||||
|
||||
// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
|
||||
// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
|
||||
AlignedResourceReadStream::~AlignedResourceReadStream() noexcept(false) {} // NOLINT
|
||||
PrivateMmapConstStream::~PrivateMmapConstStream() noexcept(false) {} // NOLINT
|
||||
|
||||
AlignedFileWriteStream::AlignedFileWriteStream(StringView path, StringView flags)
|
||||
: pimpl_{dmlc::Stream::Create(path.c_str(), flags.c_str())} {}
|
||||
|
||||
[[nodiscard]] std::size_t AlignedFileWriteStream::DoWrite(const void* ptr,
|
||||
std::size_t n_bytes) noexcept(true) {
|
||||
pimpl_->Write(ptr, n_bytes);
|
||||
return n_bytes;
|
||||
}
|
||||
|
||||
AlignedMemWriteStream::AlignedMemWriteStream(std::string* p_buf)
|
||||
: pimpl_{std::make_unique<MemoryBufferStream>(p_buf)} {}
|
||||
AlignedMemWriteStream::~AlignedMemWriteStream() = default;
|
||||
|
||||
[[nodiscard]] std::size_t AlignedMemWriteStream::DoWrite(const void* ptr,
|
||||
std::size_t n_bytes) noexcept(true) {
|
||||
this->pimpl_->Write(ptr, n_bytes);
|
||||
return n_bytes;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::size_t AlignedMemWriteStream::Tell() const noexcept(true) {
|
||||
return this->pimpl_->Tell();
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
|
||||
#if defined(xgboost_IS_WIN)
|
||||
#undef xgboost_IS_WIN
|
||||
#endif // defined(xgboost_IS_WIN)
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
345
src/common/io.h
345
src/common/io.h
@@ -1,32 +1,23 @@
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright by XGBoost Contributors 2014-2022
|
||||
* \file io.h
|
||||
* \brief general stream interface for serialization, I/O
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
|
||||
#ifndef XGBOOST_COMMON_IO_H_
|
||||
#define XGBOOST_COMMON_IO_H_
|
||||
|
||||
#include <dmlc/io.h>
|
||||
#include <rabit/rabit.h>
|
||||
|
||||
#include <algorithm> // for min
|
||||
#include <array> // for array
|
||||
#include <cstddef> // for byte, size_t
|
||||
#include <cstdlib> // for malloc, realloc, free
|
||||
#include <cstring> // for memcpy
|
||||
#include <fstream> // for ifstream
|
||||
#include <limits> // for numeric_limits
|
||||
#include <memory> // for unique_ptr
|
||||
#include <string> // for string
|
||||
#include <type_traits> // for alignment_of_v, enable_if_t
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
|
||||
#include "common.h"
|
||||
#include "xgboost/string_view.h" // for StringView
|
||||
|
||||
namespace xgboost::common {
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
using MemoryFixSizeBuffer = rabit::utils::MemoryFixSizeBuffer;
|
||||
using MemoryBufferStream = rabit::utils::MemoryBufferStream;
|
||||
|
||||
@@ -65,8 +56,8 @@ class FixedSizeStream : public PeekableInStream {
|
||||
|
||||
size_t Read(void* dptr, size_t size) override;
|
||||
size_t PeekRead(void* dptr, size_t size) override;
|
||||
[[nodiscard]] std::size_t Size() const { return buffer_.size(); }
|
||||
[[nodiscard]] std::size_t Tell() const { return pointer_; }
|
||||
size_t Size() const { return buffer_.size(); }
|
||||
size_t Tell() const { return pointer_; }
|
||||
void Seek(size_t pos);
|
||||
|
||||
void Write(const void*, size_t) override {
|
||||
@@ -136,318 +127,6 @@ inline std::string ReadAll(std::string const &path) {
|
||||
return content;
|
||||
}
|
||||
|
||||
struct MMAPFile;
|
||||
|
||||
/**
|
||||
* @brief Handler for one-shot resource. Unlike `std::pmr::*`, the resource handler is
|
||||
* fixed once it's constructed. Users cannot use mutable operations like resize
|
||||
* without acquiring the specific resource first.
|
||||
*/
|
||||
class ResourceHandler {
|
||||
public:
|
||||
// RTTI
|
||||
enum Kind : std::uint8_t {
|
||||
kMalloc = 0,
|
||||
kMmap = 1,
|
||||
};
|
||||
|
||||
private:
|
||||
Kind kind_{kMalloc};
|
||||
|
||||
public:
|
||||
virtual void* Data() = 0;
|
||||
template <typename T>
|
||||
[[nodiscard]] T* DataAs() {
|
||||
return reinterpret_cast<T*>(this->Data());
|
||||
}
|
||||
|
||||
[[nodiscard]] virtual std::size_t Size() const = 0;
|
||||
[[nodiscard]] auto Type() const { return kind_; }
|
||||
|
||||
// Allow exceptions for cleaning up resource.
|
||||
virtual ~ResourceHandler() noexcept(false);
|
||||
|
||||
explicit ResourceHandler(Kind kind) : kind_{kind} {}
|
||||
// Use shared_ptr to manage a pool like resource handler. All copy and assignment
|
||||
// operators are disabled.
|
||||
ResourceHandler(ResourceHandler const& that) = delete;
|
||||
ResourceHandler& operator=(ResourceHandler const& that) = delete;
|
||||
ResourceHandler(ResourceHandler&& that) = delete;
|
||||
ResourceHandler& operator=(ResourceHandler&& that) = delete;
|
||||
/**
|
||||
* @brief Wether two resources have the same type. (both malloc or both mmap).
|
||||
*/
|
||||
[[nodiscard]] bool IsSameType(ResourceHandler const& that) const {
|
||||
return this->Type() == that.Type();
|
||||
}
|
||||
};
|
||||
|
||||
class MallocResource : public ResourceHandler {
|
||||
void* ptr_{nullptr};
|
||||
std::size_t n_{0};
|
||||
|
||||
void Clear() noexcept(true) {
|
||||
std::free(ptr_);
|
||||
ptr_ = nullptr;
|
||||
n_ = 0;
|
||||
}
|
||||
|
||||
public:
|
||||
explicit MallocResource(std::size_t n_bytes) : ResourceHandler{kMalloc} { this->Resize(n_bytes); }
|
||||
~MallocResource() noexcept(true) override { this->Clear(); }
|
||||
|
||||
void* Data() override { return ptr_; }
|
||||
[[nodiscard]] std::size_t Size() const override { return n_; }
|
||||
/**
|
||||
* @brief Resize the resource to n_bytes. Unlike std::vector::resize, it prefers realloc
|
||||
* over malloc.
|
||||
*
|
||||
* @tparam force_malloc Force the use of malloc over realloc. Used for testing.
|
||||
*
|
||||
* @param n_bytes The new size.
|
||||
*/
|
||||
template <bool force_malloc = false>
|
||||
void Resize(std::size_t n_bytes) {
|
||||
// realloc(ptr, 0) works, but is deprecated.
|
||||
if (n_bytes == 0) {
|
||||
this->Clear();
|
||||
return;
|
||||
}
|
||||
|
||||
// If realloc fails, we need to copy the data ourselves.
|
||||
bool need_copy{false};
|
||||
void* new_ptr{nullptr};
|
||||
// use realloc first, it can handle nullptr.
|
||||
if constexpr (!force_malloc) {
|
||||
new_ptr = std::realloc(ptr_, n_bytes);
|
||||
}
|
||||
// retry with malloc if realloc fails
|
||||
if (!new_ptr) {
|
||||
// ptr_ is preserved if realloc fails
|
||||
new_ptr = std::malloc(n_bytes);
|
||||
need_copy = true;
|
||||
}
|
||||
if (!new_ptr) {
|
||||
// malloc fails
|
||||
LOG(FATAL) << "bad_malloc: Failed to allocate " << n_bytes << " bytes.";
|
||||
}
|
||||
|
||||
if (need_copy) {
|
||||
std::copy_n(reinterpret_cast<std::byte*>(ptr_), n_, reinterpret_cast<std::byte*>(new_ptr));
|
||||
}
|
||||
// default initialize
|
||||
std::memset(reinterpret_cast<std::byte*>(new_ptr) + n_, '\0', n_bytes - n_);
|
||||
// free the old ptr if malloc is used.
|
||||
if (need_copy) {
|
||||
this->Clear();
|
||||
}
|
||||
|
||||
ptr_ = new_ptr;
|
||||
n_ = n_bytes;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A class for wrapping mmap as a resource for RAII.
|
||||
*/
|
||||
class MmapResource : public ResourceHandler {
|
||||
std::unique_ptr<MMAPFile> handle_;
|
||||
std::size_t n_;
|
||||
|
||||
public:
|
||||
MmapResource(std::string path, std::size_t offset, std::size_t length);
|
||||
~MmapResource() noexcept(false) override;
|
||||
|
||||
[[nodiscard]] void* Data() override;
|
||||
[[nodiscard]] std::size_t Size() const override;
|
||||
};
|
||||
|
||||
/**
|
||||
* @param Alignment for resource read stream and aligned write stream.
|
||||
*/
|
||||
constexpr std::size_t IOAlignment() {
|
||||
// For most of the pod types in XGBoost, 8 byte is sufficient.
|
||||
return 8;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Wrap resource into a dmlc stream.
|
||||
*
|
||||
* This class is to facilitate the use of mmap. Caller can optionally use the `Read()`
|
||||
* method or the `Consume()` method. The former copies data into output, while the latter
|
||||
* makes copy only if it's a primitive type.
|
||||
*
|
||||
* Input is required to be aligned to IOAlignment().
|
||||
*/
|
||||
class AlignedResourceReadStream {
|
||||
std::shared_ptr<ResourceHandler> resource_;
|
||||
std::size_t curr_ptr_{0};
|
||||
|
||||
// Similar to SEEK_END in libc
|
||||
static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
|
||||
|
||||
public:
|
||||
explicit AlignedResourceReadStream(std::shared_ptr<ResourceHandler> resource)
|
||||
: resource_{std::move(resource)} {}
|
||||
|
||||
[[nodiscard]] std::shared_ptr<ResourceHandler> Share() noexcept(true) { return resource_; }
|
||||
/**
|
||||
* @brief Consume n_bytes of data, no copying is performed.
|
||||
*
|
||||
* @return A pair with the beginning pointer and the number of available bytes, which
|
||||
* may be smaller than requested.
|
||||
*/
|
||||
[[nodiscard]] auto Consume(std::size_t n_bytes) noexcept(true) {
|
||||
auto res_size = resource_->Size();
|
||||
auto data = reinterpret_cast<std::byte*>(resource_->Data());
|
||||
auto ptr = data + curr_ptr_;
|
||||
|
||||
// Move the cursor
|
||||
auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
|
||||
auto aligned_forward = std::min(res_size - curr_ptr_, aligned_n_bytes);
|
||||
std::size_t forward = std::min(res_size - curr_ptr_, n_bytes);
|
||||
|
||||
curr_ptr_ += aligned_forward;
|
||||
|
||||
return std::pair{ptr, forward};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
[[nodiscard]] auto Consume(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
|
||||
auto [ptr, size] = this->Consume(sizeof(T));
|
||||
if (size != sizeof(T)) {
|
||||
return false;
|
||||
}
|
||||
CHECK_EQ(reinterpret_cast<std::uintptr_t>(ptr) % std::alignment_of_v<T>, 0);
|
||||
*out = *reinterpret_cast<T*>(ptr);
|
||||
return true;
|
||||
}
|
||||
|
||||
[[nodiscard]] virtual std::size_t Tell() noexcept(true) { return curr_ptr_; }
|
||||
/**
|
||||
* @brief Read n_bytes of data, output is copied into ptr.
|
||||
*/
|
||||
[[nodiscard]] std::size_t Read(void* ptr, std::size_t n_bytes) noexcept(true) {
|
||||
auto [res_ptr, forward] = this->Consume(n_bytes);
|
||||
if (forward != 0) {
|
||||
std::memcpy(ptr, res_ptr, forward);
|
||||
}
|
||||
return forward;
|
||||
}
|
||||
/**
|
||||
* @brief Read a primitive type.
|
||||
*
|
||||
* @return Whether the read is successful.
|
||||
*/
|
||||
template <typename T>
|
||||
[[nodiscard]] auto Read(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
|
||||
return this->Consume(out);
|
||||
}
|
||||
/**
|
||||
* @brief Read a vector.
|
||||
*
|
||||
* @return Whether the read is successful.
|
||||
*/
|
||||
template <typename T>
|
||||
[[nodiscard]] bool Read(std::vector<T>* out) noexcept(true) {
|
||||
std::uint64_t n{0};
|
||||
if (!this->Consume(&n)) {
|
||||
return false;
|
||||
}
|
||||
out->resize(n);
|
||||
|
||||
auto n_bytes = sizeof(T) * n;
|
||||
if (this->Read(out->data(), n_bytes) != n_bytes) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/** @brief Virtual destructor, declared noexcept(false); only the declaration lives here. */
virtual ~AlignedResourceReadStream() noexcept(false);
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Private mmap file as a read-only stream.
|
||||
*
|
||||
* It can calculate alignment automatically based on system page size (or allocation
|
||||
* granularity on Windows).
|
||||
*
|
||||
* The file is required to be aligned by IOAlignment().
|
||||
*/
|
||||
class PrivateMmapConstStream : public AlignedResourceReadStream {
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a private mmap stream.
|
||||
*
|
||||
* @param path File path.
|
||||
* @param offset See the `offset` parameter of `mmap` for details.
|
||||
* @param length See the `length` parameter of `mmap` for details.
|
||||
*/
|
||||
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
|
||||
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
|
||||
~PrivateMmapConstStream() noexcept(false) override;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Base class for write stream with alignment defined by IOAlignment().
|
||||
*/
|
||||
class AlignedWriteStream {
|
||||
protected:
|
||||
[[nodiscard]] virtual std::size_t DoWrite(const void* ptr,
|
||||
std::size_t n_bytes) noexcept(true) = 0;
|
||||
|
||||
public:
|
||||
virtual ~AlignedWriteStream() = default;
|
||||
|
||||
[[nodiscard]] std::size_t Write(const void* ptr, std::size_t n_bytes) noexcept(false) {
|
||||
auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
|
||||
auto w_n_bytes = this->DoWrite(ptr, n_bytes);
|
||||
CHECK_EQ(w_n_bytes, n_bytes);
|
||||
auto remaining = aligned_n_bytes - n_bytes;
|
||||
if (remaining > 0) {
|
||||
std::array<std::uint8_t, IOAlignment()> padding;
|
||||
std::memset(padding.data(), '\0', padding.size());
|
||||
w_n_bytes = this->DoWrite(padding.data(), remaining);
|
||||
CHECK_EQ(w_n_bytes, remaining);
|
||||
}
|
||||
return aligned_n_bytes;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
[[nodiscard]] std::enable_if_t<std::is_pod_v<T>, std::size_t> Write(T const& v) {
|
||||
return this->Write(&v, sizeof(T));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Output stream backed by a file. Aligned to IOAlignment() bytes.
|
||||
*/
|
||||
class AlignedFileWriteStream : public AlignedWriteStream {
|
||||
std::unique_ptr<dmlc::Stream> pimpl_;
|
||||
|
||||
protected:
|
||||
[[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
|
||||
|
||||
public:
|
||||
AlignedFileWriteStream() = default;
|
||||
AlignedFileWriteStream(StringView path, StringView flags);
|
||||
~AlignedFileWriteStream() override = default;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Output stream backed by memory buffer. Aligned to IOAlignment() bytes.
|
||||
*/
|
||||
class AlignedMemWriteStream : public AlignedFileWriteStream {
|
||||
std::unique_ptr<MemoryBufferStream> pimpl_;
|
||||
|
||||
protected:
|
||||
[[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
|
||||
|
||||
public:
|
||||
explicit AlignedMemWriteStream(std::string* p_buf);
|
||||
~AlignedMemWriteStream() override;
|
||||
|
||||
[[nodiscard]] std::size_t Tell() const noexcept(true);
|
||||
};
|
||||
} // namespace xgboost::common
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_IO_H_
|
||||
|
||||
@@ -1,158 +0,0 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
|
||||
#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
|
||||
|
||||
#include <algorithm> // for fill_n
|
||||
#include <cstdint> // for uint64_t
|
||||
#include <cstring> // for memcpy
|
||||
#include <memory> // for shared_ptr, make_shared
|
||||
#include <type_traits> // for is_reference_v, remove_reference_t, is_same_v
|
||||
#include <utility> // for swap, move
|
||||
|
||||
#include "io.h" // for ResourceHandler, AlignedResourceReadStream, MallocResource
|
||||
#include "xgboost/logging.h"
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::common {
|
||||
/**
|
||||
* @brief A vector-like type that holds a reference counted resource.
|
||||
*
|
||||
* The vector size is immutable after construction. This way we can swap the underlying
|
||||
* resource when needed.
|
||||
*/
|
||||
template <typename T>
|
||||
class RefResourceView {
|
||||
static_assert(!std::is_reference_v<T>);
|
||||
|
||||
public:
|
||||
using value_type = T; // NOLINT
|
||||
using size_type = std::uint64_t; // NOLINT
|
||||
|
||||
private:
|
||||
value_type* ptr_{nullptr};
|
||||
size_type size_{0};
|
||||
std::shared_ptr<common::ResourceHandler> mem_{nullptr};
|
||||
|
||||
public:
|
||||
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
|
||||
: ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
|
||||
CHECK_GE(mem_->Size(), n);
|
||||
}
|
||||
/**
|
||||
* @brief Construct a view on ptr with length n. The ptr is held by the mem resource.
|
||||
*
|
||||
* @param ptr The pointer to view.
|
||||
* @param n The length of the view.
|
||||
* @param mem The owner of the pointer.
|
||||
* @param init Initialize the view with this value.
|
||||
*/
|
||||
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem,
|
||||
T const& init)
|
||||
: RefResourceView{ptr, n, mem} {
|
||||
if (n != 0) {
|
||||
std::fill_n(ptr_, n, init);
|
||||
}
|
||||
}
|
||||
|
||||
~RefResourceView() = default;
|
||||
|
||||
RefResourceView() = default;
|
||||
RefResourceView(RefResourceView const& that) = delete;
|
||||
RefResourceView(RefResourceView&& that) = delete;
|
||||
RefResourceView& operator=(RefResourceView const& that) = delete;
|
||||
/**
|
||||
* @brief We allow move assignment for lazy initialization.
|
||||
*/
|
||||
RefResourceView& operator=(RefResourceView&& that) = default;
|
||||
|
||||
[[nodiscard]] size_type size() const { return size_; } // NOLINT
|
||||
[[nodiscard]] size_type size_bytes() const { // NOLINT
|
||||
return Span{data(), size()}.size_bytes();
|
||||
}
|
||||
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
|
||||
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT
|
||||
[[nodiscard]] bool empty() const { return size() == 0; } // NOLINT
|
||||
|
||||
[[nodiscard]] auto cbegin() const { return data(); } // NOLINT
|
||||
[[nodiscard]] auto begin() { return data(); } // NOLINT
|
||||
[[nodiscard]] auto begin() const { return cbegin(); } // NOLINT
|
||||
[[nodiscard]] auto cend() const { return data() + size(); } // NOLINT
|
||||
[[nodiscard]] auto end() { return data() + size(); } // NOLINT
|
||||
[[nodiscard]] auto end() const { return cend(); } // NOLINT
|
||||
|
||||
[[nodiscard]] auto const& front() const { return data()[0]; } // NOLINT
|
||||
[[nodiscard]] auto& front() { return data()[0]; } // NOLINT
|
||||
[[nodiscard]] auto const& back() const { return data()[size() - 1]; } // NOLINT
|
||||
[[nodiscard]] auto& back() { return data()[size() - 1]; } // NOLINT
|
||||
|
||||
[[nodiscard]] value_type& operator[](size_type i) { return ptr_[i]; }
|
||||
[[nodiscard]] value_type const& operator[](size_type i) const { return ptr_[i]; }
|
||||
|
||||
/**
|
||||
* @brief Get the underlying resource.
|
||||
*/
|
||||
auto Resource() const { return mem_; }
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Read a vector from stream. Accepts both `std::vector` and `RefResourceView`.
|
||||
*
|
||||
* If the output vector is a referenced counted view, no copying occur.
|
||||
*/
|
||||
template <typename Vec>
|
||||
[[nodiscard]] bool ReadVec(common::AlignedResourceReadStream* fi, Vec* vec) {
|
||||
std::uint64_t n{0};
|
||||
if (!fi->Read(&n)) {
|
||||
return false;
|
||||
}
|
||||
if (n == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
using T = typename Vec::value_type;
|
||||
auto expected_bytes = sizeof(T) * n;
|
||||
|
||||
auto [ptr, n_bytes] = fi->Consume(expected_bytes);
|
||||
if (n_bytes != expected_bytes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<Vec, RefResourceView<T>>) {
|
||||
*vec = RefResourceView<T>{reinterpret_cast<T*>(ptr), n, fi->Share()};
|
||||
} else {
|
||||
vec->resize(n);
|
||||
std::memcpy(vec->data(), ptr, n_bytes);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Write a vector to stream. Accepts both `std::vector` and `RefResourceView`.
|
||||
*/
|
||||
template <typename Vec>
|
||||
[[nodiscard]] std::size_t WriteVec(AlignedFileWriteStream* fo, Vec const& vec) {
|
||||
std::size_t bytes{0};
|
||||
auto n = static_cast<std::uint64_t>(vec.size());
|
||||
bytes += fo->Write(n);
|
||||
if (n == 0) {
|
||||
return sizeof(n);
|
||||
}
|
||||
|
||||
using T = typename std::remove_reference_t<decltype(vec)>::value_type;
|
||||
bytes += fo->Write(vec.data(), vec.size() * sizeof(T));
|
||||
|
||||
return bytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Make a fixed size `RefResourceView` with malloc resource.
|
||||
*/
|
||||
template <typename T>
|
||||
[[nodiscard]] RefResourceView<T> MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) {
|
||||
auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T));
|
||||
return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
#endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
|
||||
@@ -7,7 +7,7 @@
|
||||
#include <dmlc/data.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstddef> // std::size_t
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
@@ -17,7 +17,6 @@
|
||||
#include <vector>
|
||||
|
||||
#include "../c_api/c_api_error.h"
|
||||
#include "../common/error_msg.h" // for MaxFeatureSize
|
||||
#include "../common/math.h"
|
||||
#include "array_interface.h"
|
||||
#include "arrow-cdi.h"
|
||||
@@ -301,9 +300,9 @@ class ArrayAdapter : public detail::SingleBatchDataIter<ArrayAdapterBatch> {
|
||||
array_interface_ = ArrayInterface<2>(get<Object const>(j));
|
||||
batch_ = ArrayAdapterBatch{array_interface_};
|
||||
}
|
||||
[[nodiscard]] ArrayAdapterBatch const& Value() const override { return batch_; }
|
||||
[[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
|
||||
[[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
|
||||
ArrayAdapterBatch const& Value() const override { return batch_; }
|
||||
size_t NumRows() const { return array_interface_.Shape(0); }
|
||||
size_t NumColumns() const { return array_interface_.Shape(1); }
|
||||
|
||||
private:
|
||||
ArrayAdapterBatch batch_;
|
||||
|
||||
@@ -590,7 +590,7 @@ class ArrayInterface {
|
||||
template <std::int32_t D, typename Fn>
|
||||
void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
|
||||
// Only used for cuDF at the moment.
|
||||
CHECK_EQ(array.valid.Capacity(), 0);
|
||||
CHECK_EQ(array.valid.Size(), 0);
|
||||
auto dispatch = [&](auto t) {
|
||||
using T = std::remove_const_t<decltype(t)> const;
|
||||
// Set the data size to max as we don't know the original size of a sliced array:
|
||||
|
||||
@@ -416,8 +416,7 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
|
||||
p_out->Reshape(array.shape);
|
||||
return;
|
||||
}
|
||||
CHECK_EQ(array.valid.Capacity(), 0)
|
||||
<< "Meta info like label or weight can not have missing value.";
|
||||
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
|
||||
if (array.is_contiguous && array.type == ToDType<T>::kType) {
|
||||
// Handle contiguous
|
||||
p_out->ModifyInplace([&](HostDeviceVector<T>* data, common::Span<size_t, D> shape) {
|
||||
|
||||
@@ -33,8 +33,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
|
||||
p_out->Reshape(array.shape);
|
||||
return;
|
||||
}
|
||||
CHECK_EQ(array.valid.Capacity(), 0)
|
||||
<< "Meta info like label or weight can not have missing value.";
|
||||
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
|
||||
auto ptr_device = SetDeviceToPtr(array.data);
|
||||
p_out->SetDevice(ptr_device);
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
#include <thrust/iterator/transform_output_iterator.h>
|
||||
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/cuda_context.cuh"
|
||||
#include "../common/hist_util.cuh"
|
||||
#include "../common/random.h"
|
||||
#include "../common/transform_iterator.h" // MakeIndexTransformIter
|
||||
@@ -314,8 +313,7 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
|
||||
auto d_csc_indptr = dh::ToSpan(csc_indptr);
|
||||
|
||||
auto bin_type = page.index.GetBinTypeSize();
|
||||
common::CompressedBufferWriter writer{page.cut.TotalBins() +
|
||||
static_cast<std::size_t>(1)}; // +1 for null value
|
||||
common::CompressedBufferWriter writer{page.cut.TotalBins() + 1}; // +1 for null value
|
||||
|
||||
dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
|
||||
auto ridx = idx / row_stride;
|
||||
@@ -359,10 +357,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
|
||||
|
||||
// copy gidx
|
||||
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
|
||||
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
|
||||
dh::device_vector<size_t> row_ptr(page.row_ptr);
|
||||
auto d_row_ptr = dh::ToSpan(row_ptr);
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
|
||||
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
|
||||
|
||||
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
|
||||
auto null = accessor.NullValue();
|
||||
|
||||
@@ -1,59 +1,60 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2019-2021 XGBoost contributors
|
||||
*/
|
||||
#include <xgboost/data.h>
|
||||
#include <dmlc/registry.h>
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
|
||||
#include "ellpack_page.cuh"
|
||||
#include "histogram_cut_format.h" // for ReadHistogramCuts, WriteHistogramCuts
|
||||
#include "sparse_page_writer.h" // for SparsePageFormat
|
||||
#include "sparse_page_writer.h"
|
||||
#include "histogram_cut_format.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);
|
||||
|
||||
|
||||
class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
|
||||
public:
|
||||
bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override {
|
||||
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
|
||||
auto* impl = page->Impl();
|
||||
if (!ReadHistogramCuts(&impl->Cuts(), fi)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->n_rows)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->is_dense)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->row_stride)) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &impl->gidx_buffer.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
fi->Read(&impl->n_rows);
|
||||
fi->Read(&impl->is_dense);
|
||||
fi->Read(&impl->row_stride);
|
||||
fi->Read(&impl->gidx_buffer.HostVector());
|
||||
if (!fi->Read(&impl->base_rowid)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t Write(const EllpackPage& page, common::AlignedFileWriteStream* fo) override {
|
||||
std::size_t bytes{0};
|
||||
size_t Write(const EllpackPage& page, dmlc::Stream* fo) override {
|
||||
size_t bytes = 0;
|
||||
auto* impl = page.Impl();
|
||||
bytes += WriteHistogramCuts(impl->Cuts(), fo);
|
||||
bytes += fo->Write(impl->n_rows);
|
||||
bytes += fo->Write(impl->is_dense);
|
||||
bytes += fo->Write(impl->row_stride);
|
||||
fo->Write(impl->n_rows);
|
||||
bytes += sizeof(impl->n_rows);
|
||||
fo->Write(impl->is_dense);
|
||||
bytes += sizeof(impl->is_dense);
|
||||
fo->Write(impl->row_stride);
|
||||
bytes += sizeof(impl->row_stride);
|
||||
CHECK(!impl->gidx_buffer.ConstHostVector().empty());
|
||||
bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector());
|
||||
bytes += fo->Write(impl->base_rowid);
|
||||
fo->Write(impl->gidx_buffer.HostVector());
|
||||
bytes += impl->gidx_buffer.ConstHostSpan().size_bytes() + sizeof(uint64_t);
|
||||
fo->Write(impl->base_rowid);
|
||||
bytes += sizeof(impl->base_rowid);
|
||||
return bytes;
|
||||
}
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(raw)
|
||||
.describe("Raw ELLPACK binary data format.")
|
||||
.set_body([]() { return new EllpackPageRawFormat(); });
|
||||
} // namespace xgboost::data
|
||||
.set_body([]() {
|
||||
return new EllpackPageRawFormat();
|
||||
});
|
||||
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -29,7 +29,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
|
||||
cut = common::SketchOnDMatrix(ctx, p_fmat, max_bins_per_feat, sorted_sketch, hess);
|
||||
|
||||
const uint32_t nbins = cut.Ptrs().back();
|
||||
hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
|
||||
hit_count.resize(nbins, 0);
|
||||
hit_count_tloc_.resize(ctx->Threads() * nbins, 0);
|
||||
|
||||
size_t new_size = 1;
|
||||
@@ -37,7 +37,8 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
|
||||
new_size += batch.Size();
|
||||
}
|
||||
|
||||
row_ptr = common::MakeFixedVecWithMalloc(new_size, std::size_t{0});
|
||||
row_ptr.resize(new_size);
|
||||
row_ptr[0] = 0;
|
||||
|
||||
const bool isDense = p_fmat->IsDense();
|
||||
this->isDense_ = isDense;
|
||||
@@ -60,8 +61,8 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
|
||||
|
||||
GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &&cuts,
|
||||
bst_bin_t max_bin_per_feat)
|
||||
: row_ptr{common::MakeFixedVecWithMalloc(info.num_row_ + 1, std::size_t{0})},
|
||||
hit_count{common::MakeFixedVecWithMalloc(cuts.TotalBins(), std::size_t{0})},
|
||||
: row_ptr(info.num_row_ + 1, 0),
|
||||
hit_count(cuts.TotalBins(), 0),
|
||||
cut{std::forward<common::HistogramCuts>(cuts)},
|
||||
max_numeric_bins_per_feat(max_bin_per_feat),
|
||||
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
|
||||
@@ -94,10 +95,12 @@ GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<Feature
|
||||
isDense_{isDense} {
|
||||
CHECK_GE(n_threads, 1);
|
||||
CHECK_EQ(row_ptr.size(), 0);
|
||||
row_ptr = common::MakeFixedVecWithMalloc(batch.Size() + 1, std::size_t{0});
|
||||
|
||||
// The number of threads is pegged to the batch size. If the OMP
|
||||
// block is parallelized on anything other than the batch/block size,
|
||||
// it should be reassigned
|
||||
row_ptr.resize(batch.Size() + 1, 0);
|
||||
const uint32_t nbins = cut.Ptrs().back();
|
||||
hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
|
||||
hit_count.resize(nbins, 0);
|
||||
hit_count_tloc_.resize(n_threads * nbins, 0);
|
||||
|
||||
this->PushBatch(batch, ft, n_threads);
|
||||
@@ -125,45 +128,20 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
|
||||
#undef INSTANTIATION_PUSH
|
||||
|
||||
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
|
||||
auto make_index = [this, n_index](auto t, common::BinTypeSize t_size) {
|
||||
// Must resize instead of allocating a new one. This function is called every time a
|
||||
// new batch is pushed, and we grow the size accordingly without losing the data from the
|
||||
// previous batches.
|
||||
using T = decltype(t);
|
||||
std::size_t n_bytes = sizeof(T) * n_index;
|
||||
CHECK_GE(n_bytes, this->data.size());
|
||||
|
||||
auto resource = this->data.Resource();
|
||||
decltype(this->data) new_vec;
|
||||
if (!resource) {
|
||||
CHECK(this->data.empty());
|
||||
new_vec = common::MakeFixedVecWithMalloc(n_bytes, std::uint8_t{0});
|
||||
} else {
|
||||
CHECK(resource->Type() == common::ResourceHandler::kMalloc);
|
||||
auto malloc_resource = std::dynamic_pointer_cast<common::MallocResource>(resource);
|
||||
CHECK(malloc_resource);
|
||||
malloc_resource->Resize(n_bytes);
|
||||
|
||||
// gcc-11.3 doesn't work if DataAs is used.
|
||||
std::uint8_t *new_ptr = reinterpret_cast<std::uint8_t *>(malloc_resource->Data());
|
||||
new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
|
||||
}
|
||||
this->data = std::move(new_vec);
|
||||
this->index = common::Index{common::Span{data.data(), data.size()}, t_size};
|
||||
};
|
||||
|
||||
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
|
||||
isDense) {
|
||||
// compress dense index to uint8
|
||||
make_index(std::uint8_t{}, common::kUint8BinsTypeSize);
|
||||
index.SetBinTypeSize(common::kUint8BinsTypeSize);
|
||||
index.Resize((sizeof(uint8_t)) * n_index);
|
||||
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
|
||||
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
|
||||
isDense) {
|
||||
// compress dense index to uint16
|
||||
make_index(std::uint16_t{}, common::kUint16BinsTypeSize);
|
||||
index.SetBinTypeSize(common::kUint16BinsTypeSize);
|
||||
index.Resize((sizeof(uint16_t)) * n_index);
|
||||
} else {
|
||||
// no compression
|
||||
make_index(std::uint32_t{}, common::kUint32BinsTypeSize);
|
||||
index.SetBinTypeSize(common::kUint32BinsTypeSize);
|
||||
index.Resize((sizeof(uint32_t)) * n_index);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -236,11 +214,11 @@ float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
|
||||
return std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
|
||||
bool GHistIndexMatrix::ReadColumnPage(common::AlignedResourceReadStream *fi) {
|
||||
bool GHistIndexMatrix::ReadColumnPage(dmlc::SeekStream *fi) {
|
||||
return this->columns_->Read(fi, this->cut.Ptrs().data());
|
||||
}
|
||||
|
||||
std::size_t GHistIndexMatrix::WriteColumnPage(common::AlignedFileWriteStream *fo) const {
|
||||
size_t GHistIndexMatrix::WriteColumnPage(dmlc::Stream *fo) const {
|
||||
return this->columns_->Write(fo);
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2022-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2022 by XGBoost Contributors
|
||||
*/
|
||||
#include <memory> // std::unique_ptr
|
||||
|
||||
@@ -41,9 +41,9 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page,
|
||||
}
|
||||
|
||||
void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
|
||||
common::RefResourceView<std::size_t>* p_out) {
|
||||
std::vector<size_t>* p_out) {
|
||||
auto& row_ptr = *p_out;
|
||||
row_ptr = common::MakeFixedVecWithMalloc(page->Size() + 1, std::size_t{0});
|
||||
row_ptr.resize(page->Size() + 1, 0);
|
||||
if (page->is_dense) {
|
||||
std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride);
|
||||
} else {
|
||||
@@ -95,7 +95,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
|
||||
ctx, page, &hit_count_tloc_, [&](auto bin_idx, auto) { return bin_idx; }, this);
|
||||
}
|
||||
|
||||
this->hit_count = common::MakeFixedVecWithMalloc(n_bins_total, std::size_t{0});
|
||||
this->hit_count.resize(n_bins_total, 0);
|
||||
this->GatherHitCount(ctx->Threads(), n_bins_total);
|
||||
|
||||
// sanity checks
|
||||
|
||||
@@ -9,14 +9,13 @@
|
||||
#include <atomic> // for atomic
|
||||
#include <cinttypes> // for uint32_t
|
||||
#include <cstddef> // for size_t
|
||||
#include <memory> // for make_unique
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/error_msg.h" // for InfInData
|
||||
#include "../common/hist_util.h"
|
||||
#include "../common/numeric.h"
|
||||
#include "../common/ref_resource_view.h" // for RefResourceView
|
||||
#include "../common/threading_utils.h"
|
||||
#include "../common/transform_iterator.h" // for MakeIndexTransformIter
|
||||
#include "adapter.h"
|
||||
@@ -26,11 +25,9 @@
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
class ColumnMatrix;
|
||||
class AlignedFileWriteStream;
|
||||
} // namespace common
|
||||
|
||||
/**
|
||||
* @brief preprocessed global index matrix, in CSR format.
|
||||
/*!
|
||||
* \brief preprocessed global index matrix, in CSR format
|
||||
*
|
||||
* Transform floating values to integer index in histogram This is a global histogram
|
||||
* index for CPU histogram. On GPU ellpack page is used.
|
||||
@@ -136,22 +133,20 @@ class GHistIndexMatrix {
|
||||
}
|
||||
|
||||
public:
|
||||
/** @brief row pointer to rows by element position */
|
||||
common::RefResourceView<std::size_t> row_ptr;
|
||||
/** @brief data storage for index. */
|
||||
common::RefResourceView<std::uint8_t> data;
|
||||
/** @brief The histogram index. */
|
||||
/*! \brief row pointer to rows by element position */
|
||||
std::vector<size_t> row_ptr;
|
||||
/*! \brief The index data */
|
||||
common::Index index;
|
||||
/** @brief hit count of each index, used for constructing the ColumnMatrix */
|
||||
common::RefResourceView<std::size_t> hit_count;
|
||||
/** @brief The corresponding cuts */
|
||||
/*! \brief hit count of each index, used for constructing the ColumnMatrix */
|
||||
std::vector<size_t> hit_count;
|
||||
/*! \brief The corresponding cuts */
|
||||
common::HistogramCuts cut;
|
||||
/** @brief max_bin for each feature. */
|
||||
/** \brief max_bin for each feature. */
|
||||
bst_bin_t max_numeric_bins_per_feat;
|
||||
/** @brief base row index for current page (used by external memory) */
|
||||
bst_row_t base_rowid{0};
|
||||
/*! \brief base row index for current page (used by external memory) */
|
||||
size_t base_rowid{0};
|
||||
|
||||
[[nodiscard]] bst_bin_t MaxNumBinPerFeat() const {
|
||||
bst_bin_t MaxNumBinPerFeat() const {
|
||||
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
|
||||
}
|
||||
|
||||
@@ -223,27 +218,29 @@ class GHistIndexMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] bool IsDense() const { return isDense_; }
|
||||
bool IsDense() const {
|
||||
return isDense_;
|
||||
}
|
||||
void SetDense(bool is_dense) { isDense_ = is_dense; }
|
||||
/**
|
||||
* @brief Get the local row index.
|
||||
* \brief Get the local row index.
|
||||
*/
|
||||
[[nodiscard]] std::size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
|
||||
size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
|
||||
|
||||
[[nodiscard]] bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
|
||||
[[nodiscard]] bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
|
||||
bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
|
||||
bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
|
||||
|
||||
[[nodiscard]] bool ReadColumnPage(common::AlignedResourceReadStream* fi);
|
||||
[[nodiscard]] std::size_t WriteColumnPage(common::AlignedFileWriteStream* fo) const;
|
||||
bool ReadColumnPage(dmlc::SeekStream* fi);
|
||||
size_t WriteColumnPage(dmlc::Stream* fo) const;
|
||||
|
||||
[[nodiscard]] common::ColumnMatrix const& Transpose() const;
|
||||
common::ColumnMatrix const& Transpose() const;
|
||||
|
||||
[[nodiscard]] bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
|
||||
bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
|
||||
|
||||
[[nodiscard]] float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
|
||||
[[nodiscard]] float GetFvalue(std::vector<std::uint32_t> const& ptrs,
|
||||
std::vector<float> const& values, std::vector<float> const& mins,
|
||||
bst_row_t ridx, bst_feature_t fidx, bool is_cat) const;
|
||||
float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
|
||||
float GetFvalue(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& values,
|
||||
std::vector<float> const& mins, bst_row_t ridx, bst_feature_t fidx,
|
||||
bool is_cat) const;
|
||||
|
||||
private:
|
||||
std::unique_ptr<common::ColumnMatrix> columns_;
|
||||
@@ -297,5 +294,5 @@ void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign) {
|
||||
}
|
||||
});
|
||||
}
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_GRADIENT_INDEX_H_
|
||||
|
||||
@@ -1,49 +1,38 @@
|
||||
/**
|
||||
* Copyright 2021-2023 XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2021-2022 XGBoost contributors
|
||||
*/
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for uint8_t
|
||||
#include <type_traits> // for underlying_type_t
|
||||
#include <vector> // for vector
|
||||
#include "sparse_page_writer.h"
|
||||
#include "gradient_index.h"
|
||||
#include "histogram_cut_format.h"
|
||||
|
||||
#include "../common/io.h" // for AlignedResourceReadStream
|
||||
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
|
||||
#include "gradient_index.h" // for GHistIndexMatrix
|
||||
#include "histogram_cut_format.h" // for ReadHistogramCuts
|
||||
#include "sparse_page_writer.h" // for SparsePageFormat
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
|
||||
public:
|
||||
bool Read(GHistIndexMatrix* page, common::AlignedResourceReadStream* fi) override {
|
||||
CHECK(fi);
|
||||
|
||||
bool Read(GHistIndexMatrix* page, dmlc::SeekStream* fi) override {
|
||||
if (!ReadHistogramCuts(&page->cut, fi)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// indptr
|
||||
if (!common::ReadVec(fi, &page->row_ptr)) {
|
||||
fi->Read(&page->row_ptr);
|
||||
// data
|
||||
std::vector<uint8_t> data;
|
||||
if (!fi->Read(&data)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// data
|
||||
// - bin type
|
||||
page->index.Resize(data.size());
|
||||
std::copy(data.cbegin(), data.cend(), page->index.begin());
|
||||
// bin type
|
||||
// Old gcc doesn't support reading from enum.
|
||||
std::underlying_type_t<common::BinTypeSize> uint_bin_type{0};
|
||||
if (!fi->Read(&uint_bin_type)) {
|
||||
return false;
|
||||
}
|
||||
common::BinTypeSize size_type = static_cast<common::BinTypeSize>(uint_bin_type);
|
||||
// - index buffer
|
||||
if (!common::ReadVec(fi, &page->data)) {
|
||||
return false;
|
||||
}
|
||||
// - index
|
||||
page->index = common::Index{common::Span{page->data.data(), page->data.size()}, size_type};
|
||||
|
||||
common::BinTypeSize size_type =
|
||||
static_cast<common::BinTypeSize>(uint_bin_type);
|
||||
page->index.SetBinTypeSize(size_type);
|
||||
// hit count
|
||||
if (!common::ReadVec(fi, &page->hit_count)) {
|
||||
if (!fi->Read(&page->hit_count)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&page->max_numeric_bins_per_feat)) {
|
||||
@@ -61,34 +50,38 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
|
||||
page->index.SetBinOffset(page->cut.Ptrs());
|
||||
}
|
||||
|
||||
if (!page->ReadColumnPage(fi)) {
|
||||
return false;
|
||||
}
|
||||
page->ReadColumnPage(fi);
|
||||
return true;
|
||||
}
|
||||
|
||||
std::size_t Write(GHistIndexMatrix const& page, common::AlignedFileWriteStream* fo) override {
|
||||
CHECK_NE(page.index.Size(), 0) << "Empty page is not supported.";
|
||||
std::size_t bytes = 0;
|
||||
size_t Write(GHistIndexMatrix const &page, dmlc::Stream *fo) override {
|
||||
size_t bytes = 0;
|
||||
bytes += WriteHistogramCuts(page.cut, fo);
|
||||
// indptr
|
||||
bytes += common::WriteVec(fo, page.row_ptr);
|
||||
|
||||
fo->Write(page.row_ptr);
|
||||
bytes += page.row_ptr.size() * sizeof(decltype(page.row_ptr)::value_type) +
|
||||
sizeof(uint64_t);
|
||||
// data
|
||||
// - bin type
|
||||
std::underlying_type_t<common::BinTypeSize> uint_bin_type = page.index.GetBinTypeSize();
|
||||
bytes += fo->Write(uint_bin_type);
|
||||
// - index buffer
|
||||
std::vector<std::uint8_t> data(page.index.begin(), page.index.end());
|
||||
bytes += fo->Write(static_cast<std::uint64_t>(data.size()));
|
||||
bytes += fo->Write(data.data(), data.size());
|
||||
|
||||
std::vector<uint8_t> data(page.index.begin(), page.index.end());
|
||||
fo->Write(data);
|
||||
bytes += data.size() * sizeof(decltype(data)::value_type) + sizeof(uint64_t);
|
||||
// bin type
|
||||
std::underlying_type_t<common::BinTypeSize> uint_bin_type =
|
||||
page.index.GetBinTypeSize();
|
||||
fo->Write(uint_bin_type);
|
||||
bytes += sizeof(page.index.GetBinTypeSize());
|
||||
// hit count
|
||||
bytes += common::WriteVec(fo, page.hit_count);
|
||||
fo->Write(page.hit_count);
|
||||
bytes +=
|
||||
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
|
||||
sizeof(uint64_t);
|
||||
// max_bins, base row, is_dense
|
||||
bytes += fo->Write(page.max_numeric_bins_per_feat);
|
||||
bytes += fo->Write(page.base_rowid);
|
||||
bytes += fo->Write(page.IsDense());
|
||||
fo->Write(page.max_numeric_bins_per_feat);
|
||||
bytes += sizeof(page.max_numeric_bins_per_feat);
|
||||
fo->Write(page.base_rowid);
|
||||
bytes += sizeof(page.base_rowid);
|
||||
fo->Write(page.IsDense());
|
||||
bytes += sizeof(page.IsDense());
|
||||
|
||||
bytes += page.WriteColumnPage(fo);
|
||||
return bytes;
|
||||
@@ -100,4 +93,6 @@ DMLC_REGISTRY_FILE_TAG(gradient_index_format);
|
||||
XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(raw)
|
||||
.describe("Raw GHistIndex binary data format.")
|
||||
.set_body([]() { return new GHistIndexRawFormat(); });
|
||||
} // namespace xgboost::data
|
||||
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2021-2022 by XGBoost Contributors
|
||||
*/
|
||||
#include "gradient_index_page_source.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
void GradientIndexPageSource::Fetch() {
|
||||
if (!this->ReadCache()) {
|
||||
if (count_ != 0 && !sync_) {
|
||||
@@ -20,4 +21,5 @@ void GradientIndexPageSource::Fetch() {
|
||||
this->WriteCache();
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,38 +1,36 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2021 XGBoost contributors
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
|
||||
#define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
|
||||
|
||||
#include <dmlc/io.h> // for Stream
|
||||
#include "../common/hist_util.h"
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#include "../common/hist_util.h" // for HistogramCuts
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "../common/ref_resource_view.h" // for WriteVec, ReadVec
|
||||
|
||||
namespace xgboost::data {
|
||||
inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) {
|
||||
if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
inline bool ReadHistogramCuts(common::HistogramCuts *cuts, dmlc::SeekStream *fi) {
|
||||
if (!fi->Read(&cuts->cut_values_.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) {
|
||||
if (!fi->Read(&cuts->cut_ptrs_.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) {
|
||||
if (!fi->Read(&cuts->min_vals_.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts,
|
||||
common::AlignedFileWriteStream *fo) {
|
||||
std::size_t bytes = 0;
|
||||
bytes += common::WriteVec(fo, cuts.Values());
|
||||
bytes += common::WriteVec(fo, cuts.Ptrs());
|
||||
bytes += common::WriteVec(fo, cuts.MinValues());
|
||||
inline size_t WriteHistogramCuts(common::HistogramCuts const &cuts, dmlc::Stream *fo) {
|
||||
size_t bytes = 0;
|
||||
fo->Write(cuts.cut_values_.ConstHostVector());
|
||||
bytes += cuts.cut_values_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
|
||||
fo->Write(cuts.cut_ptrs_.ConstHostVector());
|
||||
bytes += cuts.cut_ptrs_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
|
||||
fo->Write(cuts.min_vals_.ConstHostVector());
|
||||
bytes += cuts.min_vals_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
|
||||
return bytes;
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
|
||||
|
||||
@@ -240,9 +240,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
|
||||
* Generate gradient index.
|
||||
*/
|
||||
this->ghist_ = std::make_unique<GHistIndexMatrix>(Info(), std::move(cuts), p.max_bin);
|
||||
std::size_t rbegin = 0;
|
||||
std::size_t prev_sum = 0;
|
||||
std::size_t i = 0;
|
||||
size_t rbegin = 0;
|
||||
size_t prev_sum = 0;
|
||||
size_t i = 0;
|
||||
while (iter.Next()) {
|
||||
HostAdapterDispatch(proxy, [&](auto const& batch) {
|
||||
proxy->Info().num_nonzero_ = batch_nnz[i];
|
||||
|
||||
@@ -31,10 +31,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
auto num_rows = [&]() {
|
||||
return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
|
||||
return Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
|
||||
};
|
||||
auto num_cols = [&]() {
|
||||
return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
|
||||
return Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
|
||||
};
|
||||
|
||||
size_t row_stride = 0;
|
||||
@@ -74,7 +74,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
get_device());
|
||||
auto* p_sketch = &sketch_containers.back();
|
||||
proxy->Info().weights_.SetDevice(get_device());
|
||||
cuda_impl::Dispatch(proxy, [&](auto const& value) {
|
||||
Dispatch(proxy, [&](auto const& value) {
|
||||
common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, p_sketch);
|
||||
});
|
||||
}
|
||||
@@ -82,7 +82,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
accumulated_rows += batch_rows;
|
||||
dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
|
||||
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
|
||||
row_stride = std::max(row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) {
|
||||
row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
|
||||
return GetRowCounts(value, row_counts_span, get_device(), missing);
|
||||
}));
|
||||
nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(), row_counts.end());
|
||||
@@ -136,14 +136,14 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
auto rows = num_rows();
|
||||
dh::device_vector<size_t> row_counts(rows + 1, 0);
|
||||
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
|
||||
cuda_impl::Dispatch(proxy, [=](auto const& value) {
|
||||
Dispatch(proxy, [=](auto const& value) {
|
||||
return GetRowCounts(value, row_counts_span, get_device(), missing);
|
||||
});
|
||||
auto is_dense = this->IsDense();
|
||||
|
||||
proxy->Info().feature_types.SetDevice(get_device());
|
||||
auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
|
||||
auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
|
||||
auto new_impl = Dispatch(proxy, [&](auto const& value) {
|
||||
return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span,
|
||||
d_feature_types, row_stride, rows, cuts);
|
||||
});
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2021 by Contributors
|
||||
* \file proxy_dmatrix.cc
|
||||
*/
|
||||
|
||||
#include "proxy_dmatrix.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
void DMatrixProxy::SetArrayData(StringView interface_str) {
|
||||
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter{interface_str}};
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
void DMatrixProxy::SetArrayData(char const *c_interface) {
|
||||
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter(StringView{c_interface})};
|
||||
this->batch_ = adapter;
|
||||
this->Info().num_col_ = adapter->NumColumns();
|
||||
this->Info().num_row_ = adapter->NumRows();
|
||||
@@ -24,36 +25,5 @@ void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
|
||||
this->Info().num_row_ = adapter->NumRows();
|
||||
this->ctx_.gpu_id = Context::kCpuId;
|
||||
}
|
||||
|
||||
namespace cuda_impl {
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
|
||||
std::shared_ptr<DMatrixProxy> proxy, float missing);
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *, std::shared_ptr<DMatrixProxy>,
|
||||
float) {
|
||||
return nullptr;
|
||||
}
|
||||
#endif // XGBOOST_USE_CUDA
|
||||
} // namespace cuda_impl
|
||||
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
|
||||
std::shared_ptr<DMatrixProxy> proxy,
|
||||
float missing) {
|
||||
bool type_error{false};
|
||||
std::shared_ptr<DMatrix> p_fmat{nullptr};
|
||||
if (proxy->Ctx()->IsCPU()) {
|
||||
p_fmat = data::HostAdapterDispatch<false>(
|
||||
proxy.get(),
|
||||
[&](auto const &adapter) {
|
||||
auto p_fmat =
|
||||
std::shared_ptr<DMatrix>(DMatrix::Create(adapter.get(), missing, ctx->Threads()));
|
||||
return p_fmat;
|
||||
},
|
||||
&type_error);
|
||||
} else {
|
||||
p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
|
||||
}
|
||||
|
||||
return p_fmat;
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
/**
|
||||
* Copyright 2020-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2020-2022, XGBoost contributors
|
||||
*/
|
||||
#include "device_adapter.cuh"
|
||||
#include "proxy_dmatrix.cuh"
|
||||
#include "proxy_dmatrix.h"
|
||||
#include "device_adapter.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
|
||||
std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
|
||||
auto const& value = adapter->Value();
|
||||
@@ -30,15 +31,5 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
|
||||
ctx_.gpu_id = dh::CurrentDevice();
|
||||
}
|
||||
}
|
||||
|
||||
namespace cuda_impl {
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
|
||||
std::shared_ptr<DMatrixProxy> proxy,
|
||||
float missing) {
|
||||
return Dispatch<false>(proxy.get(), [&](auto const& adapter) {
|
||||
auto p_fmat = std::shared_ptr<DMatrix>{DMatrix::Create(adapter.get(), missing, ctx->Threads())};
|
||||
return p_fmat;
|
||||
});
|
||||
}
|
||||
} // namespace cuda_impl
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -6,34 +6,19 @@
|
||||
#include "device_adapter.cuh"
|
||||
#include "proxy_dmatrix.h"
|
||||
|
||||
namespace xgboost::data::cuda_impl {
|
||||
template <bool get_value = true, typename Fn>
|
||||
namespace xgboost::data {
|
||||
template <typename Fn>
|
||||
decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
|
||||
if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
|
||||
if constexpr (get_value) {
|
||||
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter());
|
||||
return fn(value);
|
||||
}
|
||||
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
|
||||
if constexpr (get_value) {
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
|
||||
return fn(value);
|
||||
}
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
|
||||
if constexpr (get_value) {
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
|
||||
return fn(value);
|
||||
}
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::data::cuda_impl
|
||||
} // namespace xgboost::data
|
||||
|
||||
@@ -62,7 +62,7 @@ class DMatrixProxy : public DMatrix {
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
|
||||
void SetArrayData(StringView interface_str);
|
||||
void SetArrayData(char const* c_interface);
|
||||
void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
|
||||
bst_feature_t n_features, bool on_host);
|
||||
|
||||
@@ -114,62 +114,28 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
|
||||
return typed;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Dispatch function call based on input type.
|
||||
*
|
||||
* @tparam get_value Whether the function Fn accepts an adapter batch or the adapter itself.
|
||||
* @tparam Fn The type of the function to be dispatched.
|
||||
*
|
||||
* @param proxy The proxy object holding the reference to the input.
|
||||
* @param fn The function to be dispatched.
|
||||
* @param type_error[out] Set to true if it's not null and the input data is not recognized by
|
||||
* the host.
|
||||
*
|
||||
* @return The return value of the function being dispatched.
|
||||
*/
|
||||
template <bool get_value = true, typename Fn>
|
||||
template <typename Fn>
|
||||
decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
|
||||
if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
|
||||
if constexpr (get_value) {
|
||||
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
|
||||
return fn(value);
|
||||
}
|
||||
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
|
||||
if (type_error) {
|
||||
*type_error = false;
|
||||
}
|
||||
return fn(value);
|
||||
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
|
||||
if constexpr (get_value) {
|
||||
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter());
|
||||
return fn(value);
|
||||
}
|
||||
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
|
||||
if (type_error) {
|
||||
*type_error = false;
|
||||
}
|
||||
return fn(value);
|
||||
} else {
|
||||
if (type_error) {
|
||||
*type_error = true;
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
|
||||
}
|
||||
if constexpr (get_value) {
|
||||
return std::result_of_t<Fn(
|
||||
decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
|
||||
} else {
|
||||
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()))>();
|
||||
}
|
||||
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Create a `SimpleDMatrix` instance from a `DMatrixProxy`.
|
||||
*/
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
|
||||
std::shared_ptr<DMatrixProxy> proxy, float missing);
|
||||
} // namespace xgboost::data
|
||||
#endif // XGBOOST_DATA_PROXY_DMATRIX_H_
|
||||
|
||||
@@ -21,7 +21,8 @@
|
||||
#include "xgboost/c_api.h"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
MetaInfo& SimpleDMatrix::Info() { return info_; }
|
||||
|
||||
const MetaInfo& SimpleDMatrix::Info() const { return info_; }
|
||||
@@ -96,10 +97,6 @@ BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
|
||||
BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
|
||||
// column page doesn't exist, generate it
|
||||
if (!column_page_) {
|
||||
auto n = std::numeric_limits<decltype(Entry::index)>::max();
|
||||
if (this->sparse_page_->Size() > n) {
|
||||
error::MaxSampleSize(n);
|
||||
}
|
||||
column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
|
||||
}
|
||||
auto begin_iter = BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
|
||||
@@ -109,10 +106,6 @@ BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
|
||||
BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches(Context const* ctx) {
|
||||
// Sorted column page doesn't exist, generate it
|
||||
if (!sorted_column_page_) {
|
||||
auto n = std::numeric_limits<decltype(Entry::index)>::max();
|
||||
if (this->sparse_page_->Size() > n) {
|
||||
error::MaxSampleSize(n);
|
||||
}
|
||||
sorted_column_page_.reset(
|
||||
new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
|
||||
sorted_column_page_->SortRows(ctx->Threads());
|
||||
@@ -434,4 +427,5 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
|
||||
|
||||
fmat_ctx_ = ctx;
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -7,6 +7,9 @@
|
||||
#ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
|
||||
#define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
|
||||
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
@@ -17,33 +20,35 @@
|
||||
#include "ellpack_page_source.h"
|
||||
#include "gradient_index_page_source.h"
|
||||
#include "sparse_page_source.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/logging.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
/**
|
||||
* \brief DMatrix used for external memory.
|
||||
*
|
||||
* The external memory is created for controlling memory usage by splitting up data into
|
||||
* multiple batches. However that doesn't mean we will actually process exactly 1 batch
|
||||
* at a time, which would be terribly slow considering that we have to loop through the
|
||||
* whole dataset for every tree split. So we use async to pre-fetch pages and let the
|
||||
* caller decide how many batches it wants to process by returning data as a shared
|
||||
* pointer. The caller can use async function to process the data or just stage those
|
||||
* batches based on its use cases. These two optimizations might defeat the purpose of
|
||||
* splitting up dataset since if you stage all the batches then the memory usage might be
|
||||
* even worse than using a single batch. As a result, we must control how many batches can
|
||||
* be in memory at any given time.
|
||||
* multiple batches. However that doesn't mean we will actually process exactly 1 batch at
|
||||
* a time, which would be terribly slow considering that we have to loop through the
|
||||
* whole dataset for every tree split. So we use async pre-fetch and let caller to decide
|
||||
* how many batches it wants to process by returning data as shared pointer. The caller
|
||||
* can use async function to process the data or just stage those batches, making the
|
||||
* decision is out of the scope for sparse page dmatrix. These 2 optimizations might
|
||||
* defeat the purpose of splitting up dataset since if you load all the batches then the
|
||||
* memory usage is even worse than using a single batch. Essentially we need to control
|
||||
* how many batches can be in memory at the same time.
|
||||
*
|
||||
* Right now the write to the cache is a sequential operation and is blocking. Reading
|
||||
* from cache, on the other hand, is async but with a hard coded limit of 3 pages as a
|
||||
* heuristic. So by sparse dmatrix itself there can be only 7 pages in main memory (might
|
||||
* be of different types) at the same time: 1 page pending for write, 3 pre-fetched sparse
|
||||
* pages, 3 pre-fetched dependent pages.
|
||||
* Right now the write to the cache is sequential operation and is blocking, reading from
|
||||
* cache is async but with a hard coded limit of 4 pages as a heuristic. So by sparse
|
||||
* dmatrix itself there can be only 9 pages in main memory (might be of different types)
|
||||
* at the same time: 1 page pending for write, 4 pre-fetched sparse pages, 4 pre-fetched
|
||||
* dependent pages. If the caller stops iteration at the middle and start again, then the
|
||||
* number of pages in memory can hit 16 due to pre-fetching, but this should be a bug in
|
||||
* caller's code (XGBoost doesn't discard a large portion of data at the end, there's no
|
||||
* sampling algo that samples only the first portion of data).
|
||||
*
|
||||
* Of course if the caller decides to retain some batches to perform parallel processing,
|
||||
* then we might load all pages in memory, which is also considered as a bug in caller's
|
||||
* code. So if the algo supports external memory, it must be careful that queue for async
|
||||
* code. So if the algo supports external memory, it must be careful that queue for async
|
||||
* call must have an upper limit.
|
||||
*
|
||||
* Another assumption we make is that the data must be immutable so caller should never
|
||||
@@ -96,7 +101,7 @@ class SparsePageDMatrix : public DMatrix {
|
||||
MetaInfo &Info() override;
|
||||
const MetaInfo &Info() const override;
|
||||
Context const *Ctx() const override { return &fmat_ctx_; }
|
||||
// The only DMatrix implementation that returns false.
|
||||
|
||||
bool SingleColBlock() const override { return false; }
|
||||
DMatrix *Slice(common::Span<int32_t const>) override {
|
||||
LOG(FATAL) << "Slicing DMatrix is not supported for external memory.";
|
||||
@@ -148,5 +153,6 @@ inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::st
|
||||
}
|
||||
return id;
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
|
||||
|
||||
@@ -1,57 +1,59 @@
|
||||
/**
|
||||
* Copyright 2015-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright (c) 2015-2021 by Contributors
|
||||
* \file sparse_page_raw_format.cc
|
||||
* Raw binary format of sparse page.
|
||||
*/
|
||||
#include <xgboost/data.h>
|
||||
#include <dmlc/registry.h>
|
||||
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "../common/ref_resource_view.h" // for WriteVec
|
||||
#include "./sparse_page_writer.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/logging.h"
|
||||
#include "./sparse_page_writer.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
class SparsePageRawFormat : public SparsePageFormat<T> {
|
||||
public:
|
||||
bool Read(T* page, common::AlignedResourceReadStream* fi) override {
|
||||
bool Read(T* page, dmlc::SeekStream* fi) override {
|
||||
auto& offset_vec = page->offset.HostVector();
|
||||
if (!common::ReadVec(fi, &offset_vec)) {
|
||||
if (!fi->Read(&offset_vec)) {
|
||||
return false;
|
||||
}
|
||||
auto& data_vec = page->data.HostVector();
|
||||
CHECK_NE(page->offset.Size(), 0U) << "Invalid SparsePage file";
|
||||
data_vec.resize(offset_vec.back());
|
||||
if (page->data.Size() != 0) {
|
||||
if (!common::ReadVec(fi, &data_vec)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!fi->Read(&page->base_rowid, sizeof(page->base_rowid))) {
|
||||
return false;
|
||||
size_t n_bytes = fi->Read(dmlc::BeginPtr(data_vec),
|
||||
(page->data).Size() * sizeof(Entry));
|
||||
CHECK_EQ(n_bytes, (page->data).Size() * sizeof(Entry))
|
||||
<< "Invalid SparsePage file";
|
||||
}
|
||||
fi->Read(&page->base_rowid, sizeof(page->base_rowid));
|
||||
return true;
|
||||
}
|
||||
|
||||
std::size_t Write(const T& page, common::AlignedFileWriteStream* fo) override {
|
||||
size_t Write(const T& page, dmlc::Stream* fo) override {
|
||||
const auto& offset_vec = page.offset.HostVector();
|
||||
const auto& data_vec = page.data.HostVector();
|
||||
CHECK(page.offset.Size() != 0 && offset_vec[0] == 0);
|
||||
CHECK_EQ(offset_vec.back(), page.data.Size());
|
||||
|
||||
std::size_t bytes{0};
|
||||
bytes += common::WriteVec(fo, offset_vec);
|
||||
fo->Write(offset_vec);
|
||||
auto bytes = page.MemCostBytes();
|
||||
bytes += sizeof(uint64_t);
|
||||
if (page.data.Size() != 0) {
|
||||
bytes += common::WriteVec(fo, data_vec);
|
||||
fo->Write(dmlc::BeginPtr(data_vec), page.data.Size() * sizeof(Entry));
|
||||
}
|
||||
bytes += fo->Write(&page.base_rowid, sizeof(page.base_rowid));
|
||||
fo->Write(&page.base_rowid, sizeof(page.base_rowid));
|
||||
bytes += sizeof(page.base_rowid);
|
||||
return bytes;
|
||||
}
|
||||
|
||||
private:
|
||||
/*! \brief external memory column offset */
|
||||
std::vector<size_t> disk_offset_;
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
|
||||
@@ -72,4 +74,5 @@ XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw)
|
||||
return new SparsePageRawFormat<SortedCSCPage>();
|
||||
});
|
||||
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,31 +1,33 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2021 XGBoost contributors
|
||||
*/
|
||||
#include "../common/device_helpers.cuh" // for CurrentDevice
|
||||
#include "proxy_dmatrix.cuh" // for Dispatch, DMatrixProxy
|
||||
#include "simple_dmatrix.cuh" // for CopyToSparsePage
|
||||
#include "sparse_page_source.h"
|
||||
#include "xgboost/data.h" // for SparsePage
|
||||
#include "proxy_dmatrix.cuh"
|
||||
#include "simple_dmatrix.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace detail {
|
||||
std::size_t NSamplesDevice(DMatrixProxy *proxy) {
|
||||
return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
|
||||
return Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
|
||||
}
|
||||
|
||||
std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
|
||||
return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
|
||||
return Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
|
||||
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
|
||||
auto device = proxy->DeviceIdx();
|
||||
if (device < 0) {
|
||||
device = dh::CurrentDevice();
|
||||
}
|
||||
CHECK_GE(device, 0);
|
||||
|
||||
cuda_impl::Dispatch(proxy,
|
||||
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
|
||||
Dispatch(proxy, [&](auto const &value) {
|
||||
CopyToSparsePage(value, device, missing, page);
|
||||
});
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,49 +1,45 @@
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2014-2022 by XGBoost Contributors
|
||||
* \file sparse_page_source.h
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
|
||||
#define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
|
||||
|
||||
#include <algorithm> // for min
|
||||
#include <atomic> // for atomic
|
||||
#include <future> // for async
|
||||
#include <algorithm> // std::min
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <future>
|
||||
#include <thread>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex> // for mutex
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <utility> // for pair, move
|
||||
#include <vector>
|
||||
|
||||
#include "../common/common.h"
|
||||
#include "../common/io.h" // for PrivateMmapConstStream
|
||||
#include "../common/timer.h" // for Monitor, Timer
|
||||
#include "adapter.h"
|
||||
#include "proxy_dmatrix.h" // for DMatrixProxy
|
||||
#include "sparse_page_writer.h" // for SparsePageFormat
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
#include "adapter.h"
|
||||
#include "sparse_page_writer.h"
|
||||
#include "proxy_dmatrix.h"
|
||||
|
||||
#include "../common/common.h"
|
||||
#include "../common/timer.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
inline void TryDeleteCacheFile(const std::string& file) {
|
||||
if (std::remove(file.c_str()) != 0) {
|
||||
// Don't throw, this is called in a destructor.
|
||||
LOG(WARNING) << "Couldn't remove external memory cache file " << file
|
||||
<< "; you may want to remove it manually";
|
||||
<< "; you may want to remove it manually";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Information about the cache including path and page offsets.
|
||||
*/
|
||||
struct Cache {
|
||||
// whether the write to the cache is complete
|
||||
bool written;
|
||||
std::string name;
|
||||
std::string format;
|
||||
// offset into binary cache file.
|
||||
std::vector<std::uint64_t> offset;
|
||||
std::vector<size_t> offset;
|
||||
|
||||
Cache(bool w, std::string n, std::string fmt)
|
||||
: written{w}, name{std::move(n)}, format{std::move(fmt)} {
|
||||
@@ -55,24 +51,11 @@ struct Cache {
|
||||
return name + format;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::string ShardName() const {
|
||||
std::string ShardName() {
|
||||
return ShardName(this->name, this->format);
|
||||
}
|
||||
/**
|
||||
* @brief Record a page with size of n_bytes.
|
||||
*/
|
||||
void Push(std::size_t n_bytes) { offset.push_back(n_bytes); }
|
||||
/**
|
||||
* @brief Returns the view start and length for the i^th page.
|
||||
*/
|
||||
[[nodiscard]] auto View(std::size_t i) const {
|
||||
std::uint64_t off = offset.at(i);
|
||||
std::uint64_t len = offset.at(i + 1) - offset[i];
|
||||
return std::pair{off, len};
|
||||
}
|
||||
/**
|
||||
* @brief Call this once the write for the cache is complete.
|
||||
*/
|
||||
|
||||
// The write is completed.
|
||||
void Commit() {
|
||||
if (!written) {
|
||||
std::partial_sum(offset.begin(), offset.end(), offset.begin());
|
||||
@@ -81,7 +64,7 @@ struct Cache {
|
||||
}
|
||||
};
|
||||
|
||||
// Prevents multi-threaded call to `GetBatches`.
|
||||
// Prevents multi-threaded call.
|
||||
class TryLockGuard {
|
||||
std::mutex& lock_;
|
||||
|
||||
@@ -94,128 +77,74 @@ class TryLockGuard {
|
||||
}
|
||||
};
|
||||
|
||||
// Similar to `dmlc::OMPException`, but doesn't need the threads to be joined before rethrow
|
||||
class ExceHandler {
|
||||
std::mutex mutex_;
|
||||
std::atomic<bool> flag_{false};
|
||||
std::exception_ptr curr_exce_{nullptr};
|
||||
|
||||
public:
|
||||
template <typename Fn>
|
||||
decltype(auto) Run(Fn&& fn) noexcept(true) {
|
||||
try {
|
||||
return fn();
|
||||
} catch (dmlc::Error const& e) {
|
||||
std::lock_guard<std::mutex> guard{mutex_};
|
||||
if (!curr_exce_) {
|
||||
curr_exce_ = std::current_exception();
|
||||
}
|
||||
flag_ = true;
|
||||
} catch (std::exception const& e) {
|
||||
std::lock_guard<std::mutex> guard{mutex_};
|
||||
if (!curr_exce_) {
|
||||
curr_exce_ = std::current_exception();
|
||||
}
|
||||
flag_ = true;
|
||||
} catch (...) {
|
||||
std::lock_guard<std::mutex> guard{mutex_};
|
||||
if (!curr_exce_) {
|
||||
curr_exce_ = std::current_exception();
|
||||
}
|
||||
flag_ = true;
|
||||
}
|
||||
return std::invoke_result_t<Fn>();
|
||||
}
|
||||
|
||||
void Rethrow() noexcept(false) {
|
||||
if (flag_) {
|
||||
CHECK(curr_exce_);
|
||||
std::rethrow_exception(curr_exce_);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Base class for all page sources. Handles fetching, writing, and iteration.
|
||||
*/
|
||||
template <typename S>
|
||||
class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
protected:
|
||||
// Prevents calling this iterator from multiple places(or threads).
|
||||
std::mutex single_threaded_;
|
||||
// The current page.
|
||||
|
||||
std::shared_ptr<S> page_;
|
||||
|
||||
bool at_end_ {false};
|
||||
float missing_;
|
||||
std::int32_t nthreads_;
|
||||
int nthreads_;
|
||||
bst_feature_t n_features_;
|
||||
// Index to the current page.
|
||||
std::uint32_t count_{0};
|
||||
// Total number of batches.
|
||||
std::uint32_t n_batches_{0};
|
||||
|
||||
uint32_t count_{0};
|
||||
|
||||
uint32_t n_batches_ {0};
|
||||
|
||||
std::shared_ptr<Cache> cache_info_;
|
||||
std::unique_ptr<dmlc::Stream> fo_;
|
||||
|
||||
using Ring = std::vector<std::future<std::shared_ptr<S>>>;
|
||||
// A ring storing futures to data. Since the DMatrix iterator is forward only, so we
|
||||
// can pre-fetch data in a ring.
|
||||
std::unique_ptr<Ring> ring_{new Ring};
|
||||
// Catching exception in pre-fetch threads to prevent segfault. Not always work though,
|
||||
// OOM error can be delayed due to lazy commit. On the bright side, if mmap is used then
|
||||
// OOM error should be rare.
|
||||
ExceHandler exce_;
|
||||
common::Monitor monitor_;
|
||||
|
||||
bool ReadCache() {
|
||||
CHECK(!at_end_);
|
||||
if (!cache_info_->written) {
|
||||
return false;
|
||||
}
|
||||
if (ring_->empty()) {
|
||||
if (fo_) {
|
||||
fo_.reset(); // flush the data to disk.
|
||||
ring_->resize(n_batches_);
|
||||
}
|
||||
// An heuristic for number of pre-fetched batches. We can make it part of BatchParam
|
||||
// to let user adjust number of pre-fetched batches when needed.
|
||||
uint32_t constexpr kPreFetch = 3;
|
||||
uint32_t constexpr kPreFetch = 4;
|
||||
|
||||
size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
|
||||
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
|
||||
std::size_t fetch_it = count_;
|
||||
size_t fetch_it = count_;
|
||||
|
||||
exce_.Rethrow();
|
||||
|
||||
for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
|
||||
for (size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
|
||||
fetch_it %= n_batches_; // ring
|
||||
if (ring_->at(fetch_it).valid()) {
|
||||
continue;
|
||||
}
|
||||
auto const* self = this; // make sure it's const
|
||||
auto const *self = this; // make sure it's const
|
||||
CHECK_LT(fetch_it, cache_info_->offset.size());
|
||||
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
|
||||
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() {
|
||||
common::Timer timer;
|
||||
timer.Start();
|
||||
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
|
||||
auto n = self->cache_info_->ShardName();
|
||||
size_t offset = self->cache_info_->offset.at(fetch_it);
|
||||
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(n.c_str())};
|
||||
fi->Seek(offset);
|
||||
CHECK_EQ(fi->Tell(), offset);
|
||||
auto page = std::make_shared<S>();
|
||||
this->exce_.Run([&] {
|
||||
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
|
||||
auto name = self->cache_info_->ShardName();
|
||||
auto [offset, length] = self->cache_info_->View(fetch_it);
|
||||
auto fi = std::make_unique<common::PrivateMmapConstStream>(name, offset, length);
|
||||
CHECK(fmt->Read(page.get(), fi.get()));
|
||||
});
|
||||
CHECK(fmt->Read(page.get(), fi.get()));
|
||||
LOG(INFO) << "Read a page in " << timer.ElapsedSeconds() << " seconds.";
|
||||
return page;
|
||||
});
|
||||
}
|
||||
|
||||
CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
|
||||
n_prefetch_batches)
|
||||
<< "Sparse DMatrix assumes forward iteration.";
|
||||
|
||||
monitor_.Start("Wait");
|
||||
page_ = (*ring_)[count_].get();
|
||||
CHECK(!(*ring_)[count_].valid());
|
||||
monitor_.Stop("Wait");
|
||||
|
||||
exce_.Rethrow();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -224,41 +153,29 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
common::Timer timer;
|
||||
timer.Start();
|
||||
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
|
||||
|
||||
auto name = cache_info_->ShardName();
|
||||
std::unique_ptr<common::AlignedFileWriteStream> fo;
|
||||
if (this->Iter() == 0) {
|
||||
fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "wb");
|
||||
} else {
|
||||
fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "ab");
|
||||
if (!fo_) {
|
||||
auto n = cache_info_->ShardName();
|
||||
fo_.reset(dmlc::Stream::Create(n.c_str(), "w"));
|
||||
}
|
||||
|
||||
auto bytes = fmt->Write(*page_, fo.get());
|
||||
|
||||
auto bytes = fmt->Write(*page_, fo_.get());
|
||||
timer.Stop();
|
||||
// Not entirely accurate, the kernels doesn't have to flush the data.
|
||||
|
||||
LOG(INFO) << static_cast<double>(bytes) / 1024.0 / 1024.0 << " MB written in "
|
||||
<< timer.ElapsedSeconds() << " seconds.";
|
||||
cache_info_->Push(bytes);
|
||||
cache_info_->offset.push_back(bytes);
|
||||
}
|
||||
|
||||
virtual void Fetch() = 0;
|
||||
|
||||
public:
|
||||
SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features, uint32_t n_batches,
|
||||
std::shared_ptr<Cache> cache)
|
||||
: missing_{missing},
|
||||
nthreads_{nthreads},
|
||||
n_features_{n_features},
|
||||
n_batches_{n_batches},
|
||||
cache_info_{std::move(cache)} {
|
||||
monitor_.Init(typeid(S).name()); // not pretty, but works for basic profiling
|
||||
}
|
||||
SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features,
|
||||
uint32_t n_batches, std::shared_ptr<Cache> cache)
|
||||
: missing_{missing}, nthreads_{nthreads}, n_features_{n_features},
|
||||
n_batches_{n_batches}, cache_info_{std::move(cache)} {}
|
||||
|
||||
SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete;
|
||||
|
||||
~SparsePageSourceImpl() override {
|
||||
// Don't orphan the threads.
|
||||
for (auto& fu : *ring_) {
|
||||
if (fu.valid()) {
|
||||
fu.get();
|
||||
@@ -266,18 +183,18 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] uint32_t Iter() const { return count_; }
|
||||
uint32_t Iter() const { return count_; }
|
||||
|
||||
const S &operator*() const override {
|
||||
CHECK(page_);
|
||||
return *page_;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::shared_ptr<S const> Page() const override {
|
||||
std::shared_ptr<S const> Page() const override {
|
||||
return page_;
|
||||
}
|
||||
|
||||
[[nodiscard]] bool AtEnd() const override {
|
||||
bool AtEnd() const override {
|
||||
return at_end_;
|
||||
}
|
||||
|
||||
@@ -285,23 +202,20 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
TryLockGuard guard{single_threaded_};
|
||||
at_end_ = false;
|
||||
count_ = 0;
|
||||
// Pre-fetch for the next round of iterations.
|
||||
this->Fetch();
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
// Push data from CUDA.
|
||||
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
|
||||
#else
|
||||
inline void DevicePush(DMatrixProxy*, float, SparsePage*) { common::AssertGPUSupport(); }
|
||||
#endif
|
||||
|
||||
class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
|
||||
// This is the source from the user.
|
||||
DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter_;
|
||||
DMatrixProxy* proxy_;
|
||||
std::size_t base_row_id_{0};
|
||||
size_t base_row_id_ {0};
|
||||
|
||||
void Fetch() final {
|
||||
page_ = std::make_shared<SparsePage>();
|
||||
@@ -330,7 +244,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
|
||||
iter_{iter}, proxy_{proxy} {
|
||||
if (!cache_info_->written) {
|
||||
iter_.Reset();
|
||||
CHECK(iter_.Next()) << "Must have at least 1 batch.";
|
||||
CHECK_EQ(iter_.Next(), 1) << "Must have at least 1 batch.";
|
||||
}
|
||||
this->Fetch();
|
||||
}
|
||||
@@ -345,7 +259,6 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
|
||||
}
|
||||
|
||||
if (at_end_) {
|
||||
CHECK_EQ(cache_info_->offset.size(), n_batches_ + 1);
|
||||
cache_info_->Commit();
|
||||
if (n_batches_ != 0) {
|
||||
CHECK_EQ(count_, n_batches_);
|
||||
@@ -458,5 +371,6 @@ class SortedCSCPageSource : public PageSourceIncMixIn<SortedCSCPage> {
|
||||
this->Fetch();
|
||||
}
|
||||
};
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
|
||||
|
||||
@@ -1,44 +1,52 @@
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright (c) 2014-2019 by Contributors
|
||||
* \file sparse_page_writer.h
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
|
||||
#define XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
|
||||
|
||||
#include <functional> // for function
|
||||
#include <string> // for string
|
||||
#include <xgboost/data.h>
|
||||
#include <dmlc/io.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
#include <functional>
|
||||
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "dmlc/io.h" // for Stream
|
||||
#include "dmlc/registry.h" // for Registry, FunctionRegEntryBase
|
||||
#include "xgboost/data.h" // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ...
|
||||
#if DMLC_ENABLE_STD_THREAD
|
||||
#include <dmlc/concurrency.h>
|
||||
#include <thread>
|
||||
#endif // DMLC_ENABLE_STD_THREAD
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
template<typename T>
|
||||
struct SparsePageFormatReg;
|
||||
|
||||
/**
|
||||
* @brief Format specification of various data formats like SparsePage.
|
||||
/*!
|
||||
* \brief Format specification of SparsePage.
|
||||
*/
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
class SparsePageFormat {
|
||||
public:
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~SparsePageFormat() = default;
|
||||
/**
|
||||
* @brief Load all the segments into page, advance fi to end of the block.
|
||||
*
|
||||
* @param page The data to read page into.
|
||||
* @param fi the input stream of the file
|
||||
* @return true of the loading as successful, false if end of file was reached
|
||||
/*!
|
||||
* \brief Load all the segments into page, advance fi to end of the block.
|
||||
* \param page The data to read page into.
|
||||
* \param fi the input stream of the file
|
||||
* \return true of the loading as successful, false if end of file was reached
|
||||
*/
|
||||
virtual bool Read(T* page, common::AlignedResourceReadStream* fi) = 0;
|
||||
/**
|
||||
* @brief save the data to fo, when a page was written.
|
||||
*
|
||||
* @param fo output stream
|
||||
virtual bool Read(T* page, dmlc::SeekStream* fi) = 0;
|
||||
/*!
|
||||
* \brief save the data to fo, when a page was written.
|
||||
* \param fo output stream
|
||||
*/
|
||||
virtual size_t Write(const T& page, common::AlignedFileWriteStream* fo) = 0;
|
||||
virtual size_t Write(const T& page, dmlc::Stream* fo) = 0;
|
||||
};
|
||||
|
||||
/*!
|
||||
@@ -97,5 +105,6 @@ struct SparsePageFormatReg
|
||||
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<GHistIndexMatrix>, \
|
||||
GHistIndexPageFmt, Name)
|
||||
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
|
||||
|
||||
@@ -172,7 +172,8 @@ class GBLinear : public GradientBooster {
|
||||
}
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override {
|
||||
uint32_t layer_begin, uint32_t /*layer_end*/, bool, int,
|
||||
unsigned) override {
|
||||
model_.LazyInitModel();
|
||||
LinearCheckLayer(layer_begin);
|
||||
auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId);
|
||||
@@ -209,8 +210,8 @@ class GBLinear : public GradientBooster {
|
||||
}
|
||||
}
|
||||
|
||||
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t /*layer_end*/,
|
||||
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned layer_begin, unsigned /*layer_end*/,
|
||||
bool) override {
|
||||
LinearCheckLayer(layer_begin);
|
||||
std::vector<bst_float>& contribs = out_contribs->HostVector();
|
||||
|
||||
@@ -18,11 +18,9 @@
|
||||
#include <vector>
|
||||
|
||||
#include "../common/common.h"
|
||||
#include "../common/error_msg.h" // for UnknownDevice, WarnOldSerialization, InplacePredictProxy
|
||||
#include "../common/random.h"
|
||||
#include "../common/threading_utils.h"
|
||||
#include "../common/timer.h"
|
||||
#include "../data/proxy_dmatrix.h" // for DMatrixProxy, HostAdapterDispatch
|
||||
#include "gbtree_model.h"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
@@ -41,6 +39,7 @@ namespace xgboost::gbm {
|
||||
DMLC_REGISTRY_FILE_TAG(gbtree);
|
||||
|
||||
void GBTree::Configure(Args const& cfg) {
|
||||
this->cfg_ = cfg;
|
||||
std::string updater_seq = tparam_.updater_seq;
|
||||
tparam_.UpdateAllowUnknown(cfg);
|
||||
tree_param_.UpdateAllowUnknown(cfg);
|
||||
@@ -60,8 +59,9 @@ void GBTree::Configure(Args const& cfg) {
|
||||
cpu_predictor_->Configure(cfg);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
auto n_gpus = common::AllVisibleGPUs();
|
||||
if (!gpu_predictor_) {
|
||||
gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
|
||||
if (!gpu_predictor_ && n_gpus != 0) {
|
||||
gpu_predictor_ = std::unique_ptr<Predictor>(
|
||||
Predictor::Create("gpu_predictor", this->ctx_));
|
||||
}
|
||||
if (n_gpus != 0) {
|
||||
gpu_predictor_->Configure(cfg);
|
||||
@@ -78,9 +78,10 @@ void GBTree::Configure(Args const& cfg) {
|
||||
|
||||
monitor_.Init("GBTree");
|
||||
|
||||
specified_updater_ = std::any_of(
|
||||
cfg.cbegin(), cfg.cend(),
|
||||
[](std::pair<std::string, std::string> const& arg) { return arg.first == "updater"; });
|
||||
specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
|
||||
[](std::pair<std::string, std::string> const& arg) {
|
||||
return arg.first == "updater";
|
||||
});
|
||||
|
||||
if (specified_updater_ && !showed_updater_warning_) {
|
||||
LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
|
||||
@@ -92,19 +93,12 @@ void GBTree::Configure(Args const& cfg) {
|
||||
showed_updater_warning_ = true;
|
||||
}
|
||||
|
||||
if (model_.learner_model_param->IsVectorLeaf()) {
|
||||
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
|
||||
<< "Only the hist tree method is supported for building multi-target trees with vector "
|
||||
"leaf.";
|
||||
}
|
||||
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
|
||||
this->ConfigureUpdaters();
|
||||
|
||||
if (updater_seq != tparam_.updater_seq) {
|
||||
updaters_.clear();
|
||||
this->InitUpdater(cfg);
|
||||
} else {
|
||||
for (auto& up : updaters_) {
|
||||
for (auto &up : updaters_) {
|
||||
up->Configure(cfg);
|
||||
}
|
||||
}
|
||||
@@ -112,6 +106,66 @@ void GBTree::Configure(Args const& cfg) {
|
||||
configured_ = true;
|
||||
}
|
||||
|
||||
// FIXME(trivialfis): This handles updaters. Because the choice of updaters depends on
|
||||
// whether external memory is used and how large is dataset. We can remove the dependency
|
||||
// on DMatrix once `hist` tree method can handle external memory so that we can make it
|
||||
// default.
|
||||
void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
|
||||
CHECK(this->configured_);
|
||||
std::string updater_seq = tparam_.updater_seq;
|
||||
CHECK(tparam_.GetInitialised());
|
||||
|
||||
tparam_.UpdateAllowUnknown(cfg);
|
||||
|
||||
this->PerformTreeMethodHeuristic(fmat);
|
||||
this->ConfigureUpdaters();
|
||||
|
||||
// initialize the updaters only when needed.
|
||||
if (updater_seq != tparam_.updater_seq) {
|
||||
LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
|
||||
this->updaters_.clear();
|
||||
this->InitUpdater(cfg);
|
||||
}
|
||||
}
|
||||
|
||||
void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
|
||||
if (specified_updater_) {
|
||||
// This method is disabled when `updater` parameter is explicitly
|
||||
// set, since only experts are expected to do so.
|
||||
return;
|
||||
}
|
||||
if (model_.learner_model_param->IsVectorLeaf()) {
|
||||
CHECK(tparam_.tree_method == TreeMethod::kHist)
|
||||
<< "Only the hist tree method is supported for building multi-target trees with vector "
|
||||
"leaf.";
|
||||
}
|
||||
|
||||
// tparam_ is set before calling this function.
|
||||
if (tparam_.tree_method != TreeMethod::kAuto) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (collective::IsDistributed()) {
|
||||
LOG(INFO) << "Tree method is automatically selected to be 'approx' "
|
||||
"for distributed training.";
|
||||
tparam_.tree_method = TreeMethod::kApprox;
|
||||
} else if (!fmat->SingleColBlock()) {
|
||||
LOG(INFO) << "Tree method is automatically set to 'approx' "
|
||||
"since external-memory data matrix is used.";
|
||||
tparam_.tree_method = TreeMethod::kApprox;
|
||||
} else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
|
||||
/* Choose tree_method='approx' automatically for large data matrix */
|
||||
LOG(INFO) << "Tree method is automatically selected to be "
|
||||
"'approx' for faster speed. To use old behavior "
|
||||
"(exact greedy algorithm on single machine), "
|
||||
"set tree_method to 'exact'.";
|
||||
tparam_.tree_method = TreeMethod::kApprox;
|
||||
} else {
|
||||
tparam_.tree_method = TreeMethod::kExact;
|
||||
}
|
||||
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
|
||||
}
|
||||
|
||||
void GBTree::ConfigureUpdaters() {
|
||||
if (specified_updater_) {
|
||||
return;
|
||||
@@ -119,25 +173,31 @@ void GBTree::ConfigureUpdaters() {
|
||||
// `updater` parameter was manually specified
|
||||
/* Choose updaters according to tree_method parameters */
|
||||
switch (tparam_.tree_method) {
|
||||
case TreeMethod::kAuto: // Use hist as default in 2.0
|
||||
case TreeMethod::kHist: {
|
||||
tparam_.updater_seq = "grow_quantile_histmaker";
|
||||
case TreeMethod::kAuto:
|
||||
// Use heuristic to choose between 'exact' and 'approx' This
|
||||
// choice is carried out in PerformTreeMethodHeuristic() before
|
||||
// calling this function.
|
||||
break;
|
||||
}
|
||||
case TreeMethod::kApprox:
|
||||
tparam_.updater_seq = "grow_histmaker";
|
||||
break;
|
||||
case TreeMethod::kExact:
|
||||
tparam_.updater_seq = "grow_colmaker,prune";
|
||||
break;
|
||||
case TreeMethod::kHist: {
|
||||
LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
|
||||
"grow_quantile_histmaker.";
|
||||
tparam_.updater_seq = "grow_quantile_histmaker";
|
||||
break;
|
||||
}
|
||||
case TreeMethod::kGPUHist: {
|
||||
common::AssertGPUSupport();
|
||||
tparam_.updater_seq = "grow_gpu_hist";
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LOG(FATAL) << "Unknown tree_method (" << static_cast<int>(tparam_.tree_method)
|
||||
<< ") detected";
|
||||
LOG(FATAL) << "Unknown tree_method ("
|
||||
<< static_cast<int>(tparam_.tree_method) << ") detected";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -193,6 +253,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
|
||||
PredictionCacheEntry* predt, ObjFunction const* obj) {
|
||||
TreesOneIter new_trees;
|
||||
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
|
||||
ConfigureWithKnownData(this->cfg_, p_fmat);
|
||||
monitor_.Start("BoostNewTrees");
|
||||
|
||||
// Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
|
||||
@@ -375,7 +436,12 @@ void GBTree::LoadConfig(Json const& in) {
|
||||
// This would cause all trees to be pushed to trees_to_update
|
||||
// e.g. updating a model, then saving and loading it would result in an empty model
|
||||
tparam_.process_type = TreeProcessType::kDefault;
|
||||
std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
|
||||
int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
|
||||
if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
|
||||
LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine. "
|
||||
"Changing predictor to auto.";
|
||||
tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
|
||||
}
|
||||
|
||||
auto msg = StringView{
|
||||
R"(
|
||||
@@ -391,32 +457,19 @@ void GBTree::LoadConfig(Json const& in) {
|
||||
LOG(WARNING) << msg << " Changing `tree_method` to `hist`.";
|
||||
}
|
||||
|
||||
std::vector<Json> updater_seq;
|
||||
if (IsA<Object>(in["updater"])) {
|
||||
// before 2.0
|
||||
error::WarnOldSerialization();
|
||||
for (auto const& kv : get<Object const>(in["updater"])) {
|
||||
auto name = kv.first;
|
||||
auto config = kv.second;
|
||||
config["name"] = name;
|
||||
updater_seq.push_back(config);
|
||||
}
|
||||
} else {
|
||||
// after 2.0
|
||||
auto const& j_updaters = get<Array const>(in["updater"]);
|
||||
updater_seq = j_updaters;
|
||||
}
|
||||
|
||||
auto const& j_updaters = get<Object const>(in["updater"]);
|
||||
updaters_.clear();
|
||||
|
||||
for (auto const& config : updater_seq) {
|
||||
auto name = get<String>(config["name"]);
|
||||
for (auto const& kv : j_updaters) {
|
||||
auto name = kv.first;
|
||||
if (n_gpus == 0 && name == "grow_gpu_hist") {
|
||||
name = "grow_quantile_histmaker";
|
||||
LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`.";
|
||||
}
|
||||
updaters_.emplace_back(TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task));
|
||||
updaters_.back()->LoadConfig(config);
|
||||
std::unique_ptr<TreeUpdater> up{
|
||||
TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task)};
|
||||
up->LoadConfig(kv.second);
|
||||
updaters_.push_back(std::move(up));
|
||||
}
|
||||
|
||||
specified_updater_ = get<Boolean>(in["specified_updater"]);
|
||||
@@ -438,14 +491,13 @@ void GBTree::SaveConfig(Json* p_out) const {
|
||||
// language binding doesn't need to know about the forest size.
|
||||
out["gbtree_model_param"] = ToJson(model_.param);
|
||||
|
||||
out["updater"] = Array{};
|
||||
auto& j_updaters = get<Array>(out["updater"]);
|
||||
out["updater"] = Object();
|
||||
|
||||
for (auto const& up : this->updaters_) {
|
||||
Json up_config{Object{}};
|
||||
up_config["name"] = String{up->Name()};
|
||||
up->SaveConfig(&up_config);
|
||||
j_updaters.emplace_back(up_config);
|
||||
auto& j_updaters = out["updater"];
|
||||
for (auto const& up : updaters_) {
|
||||
j_updaters[up->Name()] = Object();
|
||||
auto& j_up = j_updaters[up->Name()];
|
||||
up->SaveConfig(&j_up);
|
||||
}
|
||||
out["specified_updater"] = Boolean{specified_updater_};
|
||||
}
|
||||
@@ -515,8 +567,8 @@ void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, Gradien
|
||||
out_model.param.num_parallel_tree = model_.param.num_parallel_tree;
|
||||
}
|
||||
|
||||
void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) const {
|
||||
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) {
|
||||
CHECK(configured_);
|
||||
if (layer_end == 0) {
|
||||
layer_end = this->BoostedRounds();
|
||||
@@ -536,7 +588,7 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
|
||||
CHECK_EQ(out_preds->version, 0);
|
||||
}
|
||||
|
||||
auto const& predictor = GetPredictor(is_training, &out_preds->predictions, p_fmat);
|
||||
auto const& predictor = GetPredictor(&out_preds->predictions, p_fmat);
|
||||
if (out_preds->version == 0) {
|
||||
// out_preds->Size() can be non-zero as it's initialized here before any
|
||||
// tree is built at the 0^th iterator.
|
||||
@@ -556,83 +608,52 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
inline void MismatchedDevices(Context const* booster, Context const* data) {
|
||||
LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost "
|
||||
<< "is running on: " << booster->DeviceName()
|
||||
<< ", while the input data is on: " << data->DeviceName() << ".\n"
|
||||
<< R"(Potential solutions:
|
||||
- Use a data structure that matches the device ordinal in the booster.
|
||||
- Set the device for booster before call to inplace_predict.
|
||||
)";
|
||||
}
|
||||
}; // namespace
|
||||
|
||||
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) {
|
||||
// dispatch to const function.
|
||||
this->PredictBatchImpl(p_fmat, out_preds, is_training, layer_begin, layer_end);
|
||||
}
|
||||
|
||||
void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
|
||||
PredictionCacheEntry* out_preds, bst_layer_t layer_begin,
|
||||
bst_layer_t layer_end) const {
|
||||
std::unique_ptr<Predictor> const &
|
||||
GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
|
||||
DMatrix *f_dmat) const {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
|
||||
if (p_m->Ctx()->Device() != this->ctx_->Device()) {
|
||||
MismatchedDevices(this->ctx_, p_m->Ctx());
|
||||
CHECK_EQ(out_preds->version, 0);
|
||||
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
|
||||
CHECK(proxy) << error::InplacePredictProxy();
|
||||
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
|
||||
this->PredictBatchImpl(p_fmat.get(), out_preds, false, layer_begin, layer_end);
|
||||
return;
|
||||
}
|
||||
|
||||
bool known_type = this->ctx_->DispatchDevice(
|
||||
[&, begin = tree_begin, end = tree_end] {
|
||||
return this->cpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
|
||||
},
|
||||
[&, begin = tree_begin, end = tree_end] {
|
||||
return this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
|
||||
});
|
||||
if (!known_type) {
|
||||
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
|
||||
CHECK(proxy) << error::InplacePredictProxy();
|
||||
LOG(FATAL) << "Unknown data type for inplace prediction:" << proxy->Adapter().type().name();
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] std::unique_ptr<Predictor> const& GBTree::GetPredictor(
|
||||
bool is_training, HostDeviceVector<float> const* out_pred, DMatrix* f_dmat) const {
|
||||
CHECK(configured_);
|
||||
|
||||
// Data comes from SparsePageDMatrix. Since we are loading data in pages, no need to
|
||||
// prevent data copy.
|
||||
if (f_dmat && !f_dmat->SingleColBlock()) {
|
||||
if (ctx_->IsCPU()) {
|
||||
return cpu_predictor_;
|
||||
} else {
|
||||
common::AssertGPUSupport();
|
||||
if (tparam_.predictor != PredictorType::kAuto) {
|
||||
if (tparam_.predictor == PredictorType::kGPUPredictor) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
|
||||
CHECK(gpu_predictor_);
|
||||
return gpu_predictor_;
|
||||
#else
|
||||
common::AssertGPUSupport();
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
if (tparam_.predictor == PredictorType::kOneAPIPredictor) {
|
||||
#if defined(XGBOOST_USE_ONEAPI)
|
||||
CHECK(oneapi_predictor_);
|
||||
return oneapi_predictor_;
|
||||
#else
|
||||
common::AssertOneAPISupport();
|
||||
#endif // defined(XGBOOST_USE_ONEAPI)
|
||||
}
|
||||
CHECK(cpu_predictor_);
|
||||
return cpu_predictor_;
|
||||
}
|
||||
|
||||
// Data comes from Device DMatrix.
|
||||
auto is_ellpack =
|
||||
f_dmat && f_dmat->PageExists<EllpackPage>() && !f_dmat->PageExists<SparsePage>();
|
||||
auto is_ellpack = f_dmat && f_dmat->PageExists<EllpackPage>() &&
|
||||
!f_dmat->PageExists<SparsePage>();
|
||||
// Data comes from device memory, like CuDF or CuPy.
|
||||
auto is_from_device = f_dmat && f_dmat->PageExists<SparsePage>() &&
|
||||
(*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
|
||||
auto is_from_device =
|
||||
f_dmat && f_dmat->PageExists<SparsePage>() &&
|
||||
(*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
|
||||
auto on_device = is_ellpack || is_from_device;
|
||||
|
||||
// Use GPU Predictor if data is already on device and gpu_id is set.
|
||||
if (on_device && ctx_->IsCUDA()) {
|
||||
common::AssertGPUSupport();
|
||||
if (on_device && ctx_->gpu_id >= 0) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
|
||||
CHECK(gpu_predictor_);
|
||||
return gpu_predictor_;
|
||||
#else
|
||||
LOG(FATAL) << "Data is on CUDA device, but XGBoost is not compiled with "
|
||||
"CUDA support.";
|
||||
return cpu_predictor_;
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
|
||||
// GPU_Hist by default has prediction cache calculated from quantile values,
|
||||
@@ -644,19 +665,23 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
|
||||
if ((out_pred && out_pred->Size() == 0) && (model_.param.num_trees != 0) &&
|
||||
// FIXME(trivialfis): Implement a better method for testing whether data
|
||||
// is on device after DMatrix refactoring is done.
|
||||
!on_device && is_training) {
|
||||
!on_device) {
|
||||
CHECK(cpu_predictor_);
|
||||
return cpu_predictor_;
|
||||
}
|
||||
|
||||
if (ctx_->IsCPU()) {
|
||||
return cpu_predictor_;
|
||||
} else {
|
||||
common::AssertGPUSupport();
|
||||
if (tparam_.tree_method == TreeMethod::kGPUHist) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
|
||||
CHECK(gpu_predictor_);
|
||||
return gpu_predictor_;
|
||||
#else
|
||||
common::AssertGPUSupport();
|
||||
return cpu_predictor_;
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
|
||||
CHECK(cpu_predictor_);
|
||||
return cpu_predictor_;
|
||||
}
|
||||
|
||||
@@ -771,7 +796,7 @@ class Dart : public GBTree {
|
||||
bool training, unsigned layer_begin,
|
||||
unsigned layer_end) const {
|
||||
CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
|
||||
auto& predictor = this->GetPredictor(training, &p_out_preds->predictions, p_fmat);
|
||||
auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
|
||||
CHECK(predictor);
|
||||
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
|
||||
model_);
|
||||
@@ -835,16 +860,15 @@ class Dart : public GBTree {
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
auto n_groups = model_.learner_model_param->num_output_group;
|
||||
|
||||
if (ctx_->Device() != p_fmat->Ctx()->Device()) {
|
||||
MismatchedDevices(ctx_, p_fmat->Ctx());
|
||||
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
|
||||
CHECK(proxy) << error::InplacePredictProxy();
|
||||
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
|
||||
this->PredictBatchImpl(p_fmat.get(), p_out_preds, false, layer_begin, layer_end);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<Predictor const*> predictors {
|
||||
cpu_predictor_.get(),
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
gpu_predictor_.get()
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
};
|
||||
Predictor const* predictor{nullptr};
|
||||
StringView msg{"Unsupported data type for inplace predict."};
|
||||
|
||||
PredictionCacheEntry predts;
|
||||
if (ctx_->gpu_id != Context::kCpuId) {
|
||||
predts.predictions.SetDevice(ctx_->gpu_id);
|
||||
@@ -853,29 +877,32 @@ class Dart : public GBTree {
|
||||
|
||||
auto predict_impl = [&](size_t i) {
|
||||
predts.predictions.Fill(0);
|
||||
bool success = this->ctx_->DispatchDevice(
|
||||
[&] {
|
||||
return cpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
|
||||
},
|
||||
[&] {
|
||||
return gpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
|
||||
});
|
||||
CHECK(success) << msg;
|
||||
if (tparam_.predictor == PredictorType::kAuto) {
|
||||
// Try both predictor implementations
|
||||
bool success = false;
|
||||
for (auto const& p : predictors) {
|
||||
if (p && p->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)) {
|
||||
success = true;
|
||||
predictor = p;
|
||||
break;
|
||||
}
|
||||
}
|
||||
CHECK(success) << msg;
|
||||
} else {
|
||||
predictor = this->GetPredictor().get();
|
||||
bool success = predictor->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
|
||||
CHECK(success) << msg << std::endl
|
||||
<< "Current Predictor: "
|
||||
<< (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor"
|
||||
: "gpu_predictor");
|
||||
}
|
||||
};
|
||||
|
||||
// Inplace predict is not used for training, so no need to drop tree.
|
||||
for (bst_tree_t i = tree_begin; i < tree_end; ++i) {
|
||||
predict_impl(i);
|
||||
if (i == tree_begin) {
|
||||
this->ctx_->DispatchDevice(
|
||||
[&] {
|
||||
this->cpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
|
||||
model_);
|
||||
},
|
||||
[&] {
|
||||
this->gpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
|
||||
model_);
|
||||
});
|
||||
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_);
|
||||
}
|
||||
// Multiple the tree weight
|
||||
auto w = this->weight_drop_.at(i);
|
||||
@@ -905,24 +932,25 @@ class Dart : public GBTree {
|
||||
std::vector<bst_float> *out_preds,
|
||||
unsigned layer_begin, unsigned layer_end) override {
|
||||
DropTrees(false);
|
||||
auto &predictor = this->GetPredictor(false);
|
||||
auto &predictor = this->GetPredictor();
|
||||
uint32_t _, tree_end;
|
||||
std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
predictor->PredictInstance(inst, out_preds, model_, tree_end);
|
||||
}
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate) override {
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned layer_begin, unsigned layer_end, bool approximate, int,
|
||||
unsigned) override {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_, tree_end, &weight_drop_,
|
||||
approximate);
|
||||
}
|
||||
|
||||
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate) override {
|
||||
void PredictInteractionContributions(
|
||||
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
|
||||
unsigned layer_begin, unsigned layer_end, bool approximate) override {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model_, tree_end,
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2021 by Contributors
|
||||
*/
|
||||
#include "../common/device_helpers.cuh"
|
||||
#include "xgboost/context.h"
|
||||
#include "xgboost/linalg.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
namespace xgboost::gbm {
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
|
||||
void GPUCopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
|
||||
bst_group_t n_groups, bst_group_t group_id,
|
||||
HostDeviceVector<GradientPair> *out_gpair) {
|
||||
@@ -38,4 +41,5 @@ void GPUDartInplacePredictInc(common::Span<float> out_predts, common::Span<float
|
||||
out_predts[offset] += (predts[offset] - base_score(0)) * tree_w;
|
||||
});
|
||||
}
|
||||
} // namespace xgboost::gbm
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
|
||||
121
src/gbm/gbtree.h
121
src/gbm/gbtree.h
@@ -43,18 +43,30 @@ enum class TreeProcessType : int {
|
||||
kDefault = 0,
|
||||
kUpdate = 1
|
||||
};
|
||||
|
||||
enum class PredictorType : int {
|
||||
kAuto = 0,
|
||||
kCPUPredictor,
|
||||
kGPUPredictor,
|
||||
kOneAPIPredictor
|
||||
};
|
||||
} // namespace xgboost
|
||||
|
||||
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
|
||||
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
|
||||
DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);
|
||||
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
|
||||
namespace xgboost::gbm {
|
||||
/*! \brief training parameters */
|
||||
struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
|
||||
/*! \brief tree updater sequence */
|
||||
std::string updater_seq;
|
||||
/*! \brief type of boosting process to run */
|
||||
TreeProcessType process_type;
|
||||
// predictor type
|
||||
PredictorType predictor;
|
||||
// tree construction method
|
||||
TreeMethod tree_method;
|
||||
// declare parameters
|
||||
@@ -69,6 +81,13 @@ struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
|
||||
.describe("Whether to run the normal boosting process that creates new trees,"\
|
||||
" or to update the trees in an existing model.");
|
||||
DMLC_DECLARE_ALIAS(updater_seq, updater);
|
||||
DMLC_DECLARE_FIELD(predictor)
|
||||
.set_default(PredictorType::kAuto)
|
||||
.add_enum("auto", PredictorType::kAuto)
|
||||
.add_enum("cpu_predictor", PredictorType::kCPUPredictor)
|
||||
.add_enum("gpu_predictor", PredictorType::kGPUPredictor)
|
||||
.add_enum("oneapi_predictor", PredictorType::kOneAPIPredictor)
|
||||
.describe("Predictor algorithm type");
|
||||
DMLC_DECLARE_FIELD(tree_method)
|
||||
.set_default(TreeMethod::kAuto)
|
||||
.add_enum("auto", TreeMethod::kAuto)
|
||||
@@ -173,8 +192,12 @@ class GBTree : public GradientBooster {
|
||||
: GradientBooster{ctx}, model_(booster_config, ctx_) {}
|
||||
|
||||
void Configure(const Args& cfg) override;
|
||||
// Revise `tree_method` and `updater` parameters after seeing the training
|
||||
// data matrix, only useful when tree_method is auto.
|
||||
void PerformTreeMethodHeuristic(DMatrix* fmat);
|
||||
/*! \brief Map `tree_method` parameter to `updater` parameter */
|
||||
void ConfigureUpdaters();
|
||||
void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);
|
||||
|
||||
/**
|
||||
* \brief Optionally update the leaf value.
|
||||
@@ -189,11 +212,21 @@ class GBTree : public GradientBooster {
|
||||
void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
|
||||
PredictionCacheEntry* predt, ObjFunction const* obj) override;
|
||||
|
||||
[[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; }
|
||||
bool UseGPU() const override {
|
||||
return
|
||||
tparam_.predictor == PredictorType::kGPUPredictor ||
|
||||
tparam_.tree_method == TreeMethod::kGPUHist;
|
||||
}
|
||||
|
||||
[[nodiscard]] GBTreeTrainParam const& GetTrainParam() const { return tparam_; }
|
||||
GBTreeTrainParam const& GetTrainParam() const {
|
||||
return tparam_;
|
||||
}
|
||||
|
||||
void Load(dmlc::Stream* fi) override {
|
||||
model_.Load(fi);
|
||||
this->cfg_.clear();
|
||||
}
|
||||
|
||||
void Load(dmlc::Stream* fi) override { model_.Load(fi); }
|
||||
void Save(dmlc::Stream* fo) const override {
|
||||
model_.Save(fo);
|
||||
}
|
||||
@@ -213,14 +246,39 @@ class GBTree : public GradientBooster {
|
||||
return !model_.trees.empty() || !model_.trees_to_update.empty();
|
||||
}
|
||||
|
||||
void PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) const;
|
||||
|
||||
void PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool training,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) override;
|
||||
|
||||
void InplacePredict(std::shared_ptr<DMatrix> p_m, float missing, PredictionCacheEntry* out_preds,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) const override;
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) const override {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
|
||||
std::vector<Predictor const *> predictors{
|
||||
cpu_predictor_.get(),
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
gpu_predictor_.get()
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
};
|
||||
StringView msg{"Unsupported data type for inplace predict."};
|
||||
if (tparam_.predictor == PredictorType::kAuto) {
|
||||
// Try both predictor implementations
|
||||
for (auto const &p : predictors) {
|
||||
if (p && p->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
LOG(FATAL) << msg;
|
||||
} else {
|
||||
bool success = this->GetPredictor()->InplacePredict(p_m, model_, missing, out_preds,
|
||||
tree_begin, tree_end);
|
||||
CHECK(success) << msg << std::endl
|
||||
<< "Current Predictor: "
|
||||
<< (tparam_.predictor == PredictorType::kCPUPredictor
|
||||
? "cpu_predictor"
|
||||
: "gpu_predictor");
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
|
||||
std::vector<bst_feature_t>* features,
|
||||
@@ -301,29 +359,32 @@ class GBTree : public GradientBooster {
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, "
|
||||
"n_iteration), use model slicing instead.";
|
||||
this->GetPredictor(false)->PredictLeaf(p_fmat, out_preds, model_, tree_end);
|
||||
this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, tree_end);
|
||||
}
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate) override {
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
uint32_t layer_begin, uint32_t layer_end, bool approximate,
|
||||
int, unsigned) override {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
CHECK_EQ(tree_begin, 0) << "Predict contribution supports only iteration end: (0, "
|
||||
"n_iteration), using model slicing instead.";
|
||||
this->GetPredictor(false)->PredictContribution(p_fmat, out_contribs, model_, tree_end, nullptr,
|
||||
approximate);
|
||||
CHECK_EQ(tree_begin, 0)
|
||||
<< "Predict contribution supports only iteration end: (0, "
|
||||
"n_iteration), using model slicing instead.";
|
||||
this->GetPredictor()->PredictContribution(
|
||||
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
|
||||
}
|
||||
|
||||
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate) override {
|
||||
void PredictInteractionContributions(
|
||||
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
|
||||
uint32_t layer_begin, uint32_t layer_end, bool approximate) override {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
CHECK_EQ(tree_begin, 0) << "Predict interaction contribution supports only iteration end: (0, "
|
||||
"n_iteration), using model slicing instead.";
|
||||
this->GetPredictor(false)->PredictInteractionContributions(p_fmat, out_contribs, model_,
|
||||
tree_end, nullptr, approximate);
|
||||
CHECK_EQ(tree_begin, 0)
|
||||
<< "Predict interaction contribution supports only iteration end: (0, "
|
||||
"n_iteration), using model slicing instead.";
|
||||
this->GetPredictor()->PredictInteractionContributions(
|
||||
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
|
||||
}
|
||||
|
||||
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
|
||||
@@ -339,9 +400,8 @@ class GBTree : public GradientBooster {
|
||||
std::vector<HostDeviceVector<bst_node_t>>* out_position,
|
||||
std::vector<std::unique_ptr<RegTree>>* ret);
|
||||
|
||||
[[nodiscard]] std::unique_ptr<Predictor> const& GetPredictor(
|
||||
bool is_training, HostDeviceVector<float> const* out_pred = nullptr,
|
||||
DMatrix* f_dmat = nullptr) const;
|
||||
std::unique_ptr<Predictor> const& GetPredictor(HostDeviceVector<float> const* out_pred = nullptr,
|
||||
DMatrix* f_dmat = nullptr) const;
|
||||
|
||||
// commit new trees all at once
|
||||
virtual void CommitModel(TreesOneIter&& new_trees);
|
||||
@@ -356,17 +416,22 @@ class GBTree : public GradientBooster {
|
||||
bool showed_updater_warning_ {false};
|
||||
bool specified_updater_ {false};
|
||||
bool configured_ {false};
|
||||
// configurations for tree
|
||||
Args cfg_;
|
||||
// the updaters that can be applied to each of tree
|
||||
std::vector<std::unique_ptr<TreeUpdater>> updaters_;
|
||||
// Predictors
|
||||
std::unique_ptr<Predictor> cpu_predictor_;
|
||||
std::unique_ptr<Predictor> gpu_predictor_{nullptr};
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
std::unique_ptr<Predictor> gpu_predictor_;
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_ONEAPI)
|
||||
std::unique_ptr<Predictor> oneapi_predictor_;
|
||||
#endif // defined(XGBOOST_USE_ONEAPI)
|
||||
common::Monitor monitor_;
|
||||
};
|
||||
|
||||
} // namespace xgboost::gbm
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_GBM_GBTREE_H_
|
||||
|
||||
@@ -40,7 +40,6 @@
|
||||
#include "common/api_entry.h" // for XGBAPIThreadLocalEntry
|
||||
#include "common/charconv.h" // for to_chars, to_chars_result, NumericLimits, from_...
|
||||
#include "common/common.h" // for ToString, Split
|
||||
#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization
|
||||
#include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
|
||||
#include "common/observer.h" // for TrainingObserver
|
||||
#include "common/random.h" // for GlobalRandom
|
||||
@@ -357,6 +356,21 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
|
||||
using LearnerAPIThreadLocalStore =
|
||||
dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;
|
||||
|
||||
namespace {
|
||||
StringView ModelMsg() {
|
||||
return StringView{
|
||||
R"doc(
|
||||
If you are loading a serialized model (like pickle in Python, RDS in R) generated by
|
||||
older XGBoost, please export the model by calling `Booster.save_model` from that version
|
||||
first, then load it back in current version. See:
|
||||
|
||||
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
|
||||
|
||||
for more details about differences between saving model and serializing.
|
||||
)doc"};
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
class LearnerConfiguration : public Learner {
|
||||
private:
|
||||
std::mutex config_lock_;
|
||||
@@ -516,7 +530,7 @@ class LearnerConfiguration : public Learner {
|
||||
}
|
||||
|
||||
if (!Version::Same(origin_version)) {
|
||||
error::WarnOldSerialization();
|
||||
LOG(WARNING) << ModelMsg();
|
||||
return; // skip configuration if version is not matched
|
||||
}
|
||||
|
||||
@@ -547,7 +561,7 @@ class LearnerConfiguration : public Learner {
|
||||
for (size_t i = 0; i < n_metrics; ++i) {
|
||||
auto old_serialization = IsA<String>(j_metrics[i]);
|
||||
if (old_serialization) {
|
||||
error::WarnOldSerialization();
|
||||
LOG(WARNING) << ModelMsg();
|
||||
metric_names_[i] = get<String>(j_metrics[i]);
|
||||
} else {
|
||||
metric_names_[i] = get<String>(j_metrics[i]["name"]);
|
||||
@@ -749,7 +763,9 @@ class LearnerConfiguration : public Learner {
|
||||
CHECK(matrix.first.ptr);
|
||||
CHECK(!matrix.second.ref.expired());
|
||||
const uint64_t num_col = matrix.first.ptr->Info().num_col_;
|
||||
error::MaxFeatureSize(num_col);
|
||||
CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
|
||||
<< "Unfortunately, XGBoost does not support data matrices with "
|
||||
<< std::numeric_limits<unsigned>::max() << " features or greater";
|
||||
num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
|
||||
}
|
||||
|
||||
@@ -1158,7 +1174,7 @@ class LearnerIO : public LearnerConfiguration {
|
||||
Json memory_snapshot;
|
||||
if (header[1] == '"') {
|
||||
memory_snapshot = Json::Load(StringView{buffer});
|
||||
error::WarnOldSerialization();
|
||||
LOG(WARNING) << ModelMsg();
|
||||
} else if (std::isalpha(header[1])) {
|
||||
memory_snapshot = Json::Load(StringView{buffer}, std::ios::binary);
|
||||
} else {
|
||||
@@ -1177,7 +1193,7 @@ class LearnerIO : public LearnerConfiguration {
|
||||
header.resize(serialisation_header_.size());
|
||||
CHECK_EQ(fp.Read(&header[0], header.size()), serialisation_header_.size());
|
||||
// Avoid printing the content in loaded header, which might be random binary code.
|
||||
CHECK(header == serialisation_header_) << error::OldSerialization();
|
||||
CHECK(header == serialisation_header_) << ModelMsg();
|
||||
int64_t sz {-1};
|
||||
CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz));
|
||||
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
@@ -1397,8 +1413,6 @@ class LearnerImpl : public LearnerIO {
|
||||
this->CheckModelInitialized();
|
||||
|
||||
auto& out_predictions = this->GetThreadLocal().prediction_entry;
|
||||
out_predictions.version = 0;
|
||||
|
||||
this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
|
||||
if (type == PredictionType::kValue) {
|
||||
obj_->PredTransform(&out_predictions.predictions);
|
||||
|
||||
@@ -577,8 +577,8 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double
|
||||
if (lj(0) >= Eps64()) {
|
||||
tj_minus(i) = std::pow(lj(i) / lj(0), regularizer);
|
||||
}
|
||||
assert(!isinf(ti_plus(i)));
|
||||
assert(!isinf(tj_minus(i)));
|
||||
assert(!std::isinf(ti_plus(i)));
|
||||
assert(!std::isinf(tj_minus(i)));
|
||||
});
|
||||
}
|
||||
} // namespace cuda_impl
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
#include "../common/bitfield.h" // for RBitField8
|
||||
#include "../common/categorical.h" // for IsCat, Decision
|
||||
#include "../common/common.h" // for DivRoundUp
|
||||
#include "../common/error_msg.h" // for InplacePredictProxy
|
||||
#include "../common/math.h" // for CheckNAN
|
||||
#include "../common/threading_utils.h" // for ParallelFor
|
||||
#include "../data/adapter.h" // for ArrayAdapter, CSRAdapter, CSRArrayAdapter
|
||||
@@ -468,6 +467,7 @@ class ColumnSplitHelper {
|
||||
void MaskOneTree(RegTree::FVec const &feat, std::size_t tree_id, std::size_t row_id) {
|
||||
auto const &tree = *model_.trees[tree_id];
|
||||
auto const &cats = tree.GetCategoriesMatrix();
|
||||
auto const has_categorical = tree.HasCategoricalSplit();
|
||||
bst_node_t n_nodes = tree.GetNodes().size();
|
||||
|
||||
for (bst_node_t nid = 0; nid < n_nodes; nid++) {
|
||||
@@ -484,10 +484,16 @@ class ColumnSplitHelper {
|
||||
}
|
||||
|
||||
auto const fvalue = feat.GetFvalue(split_index);
|
||||
auto const decision = tree.HasCategoricalSplit()
|
||||
? GetDecision<true>(node, nid, fvalue, cats)
|
||||
: GetDecision<false>(node, nid, fvalue, cats);
|
||||
if (decision) {
|
||||
if (has_categorical && common::IsCat(cats.split_type, nid)) {
|
||||
auto const node_categories =
|
||||
cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
|
||||
if (!common::Decision(node_categories, fvalue)) {
|
||||
decision_bits_.Set(bit_index);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (fvalue >= node.SplitCond()) {
|
||||
decision_bits_.Set(bit_index);
|
||||
}
|
||||
}
|
||||
@@ -505,7 +511,7 @@ class ColumnSplitHelper {
|
||||
if (missing_bits_.Check(bit_index)) {
|
||||
return node.DefaultChild();
|
||||
} else {
|
||||
return node.LeftChild() + !decision_bits_.Check(bit_index);
|
||||
return node.LeftChild() + decision_bits_.Check(bit_index);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -742,7 +748,7 @@ class CPUPredictor : public Predictor {
|
||||
PredictionCacheEntry *out_preds, uint32_t tree_begin,
|
||||
unsigned tree_end) const override {
|
||||
auto proxy = dynamic_cast<data::DMatrixProxy *>(p_m.get());
|
||||
CHECK(proxy)<< error::InplacePredictProxy();
|
||||
CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input.";
|
||||
CHECK(!p_m->Info().IsColumnSplit())
|
||||
<< "Inplace predict support for column-wise data split is not yet implemented.";
|
||||
auto x = proxy->Adapter();
|
||||
@@ -884,8 +890,9 @@ class CPUPredictor : public Predictor {
|
||||
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
|
||||
auto page = batch.GetView();
|
||||
// parallel over local batch
|
||||
common::ParallelFor(batch.Size(), n_threads, [&](auto i) {
|
||||
auto row_idx = batch.base_rowid + i;
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
||||
common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
|
||||
auto row_idx = static_cast<size_t>(batch.base_rowid + i);
|
||||
RegTree::FVec &feats = feat_vecs[omp_get_thread_num()];
|
||||
if (feats.Size() == 0) {
|
||||
feats.Init(num_feature);
|
||||
|
||||
@@ -11,13 +11,10 @@
|
||||
#include <any> // for any, any_cast
|
||||
#include <memory>
|
||||
|
||||
#include "../collective/communicator-inl.cuh"
|
||||
#include "../common/bitfield.h"
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/common.h"
|
||||
#include "../common/cuda_context.cuh" // for CUDAContext
|
||||
#include "../common/device_helpers.cuh"
|
||||
#include "../common/error_msg.h" // for InplacePredictProxy
|
||||
#include "../data/device_adapter.cuh"
|
||||
#include "../data/ellpack_page.cuh"
|
||||
#include "../data/proxy_dmatrix.h"
|
||||
@@ -113,11 +110,13 @@ struct SparsePageLoader {
|
||||
bool use_shared;
|
||||
SparsePageView data;
|
||||
float* smem;
|
||||
size_t entry_start;
|
||||
|
||||
__device__ SparsePageLoader(SparsePageView data, bool use_shared, bst_feature_t num_features,
|
||||
bst_row_t num_rows, size_t entry_start, float)
|
||||
: use_shared(use_shared),
|
||||
data(data) {
|
||||
data(data),
|
||||
entry_start(entry_start) {
|
||||
extern __shared__ float _smem[];
|
||||
smem = _smem;
|
||||
// Copy instances
|
||||
@@ -440,7 +439,7 @@ struct ShapSplitCondition {
|
||||
if (isnan(x)) {
|
||||
return is_missing_branch;
|
||||
}
|
||||
if (categories.Capacity() != 0) {
|
||||
if (categories.Size() != 0) {
|
||||
auto cat = static_cast<uint32_t>(x);
|
||||
return categories.Check(cat);
|
||||
} else {
|
||||
@@ -455,7 +454,7 @@ struct ShapSplitCondition {
|
||||
if (l.Data() == r.Data()) {
|
||||
return l;
|
||||
}
|
||||
if (l.Capacity() > r.Capacity()) {
|
||||
if (l.Size() > r.Size()) {
|
||||
thrust::swap(l, r);
|
||||
}
|
||||
for (size_t i = 0; i < r.Bits().size(); ++i) {
|
||||
@@ -467,7 +466,7 @@ struct ShapSplitCondition {
|
||||
// Combine two split conditions on the same feature
|
||||
XGBOOST_DEVICE void Merge(ShapSplitCondition other) {
|
||||
// Combine duplicate features
|
||||
if (categories.Capacity() != 0 || other.categories.Capacity() != 0) {
|
||||
if (categories.Size() != 0 || other.categories.Size() != 0) {
|
||||
categories = Intersect(categories, other.categories);
|
||||
} else {
|
||||
feature_lower_bound = max(feature_lower_bound, other.feature_lower_bound);
|
||||
@@ -623,199 +622,6 @@ size_t SharedMemoryBytes(size_t cols, size_t max_shared_memory_bytes) {
|
||||
}
|
||||
return shared_memory_bytes;
|
||||
}
|
||||
|
||||
using BitVector = LBitField64;
|
||||
|
||||
__global__ void MaskBitVectorKernel(
|
||||
SparsePageView data, common::Span<RegTree::Node const> d_nodes,
|
||||
common::Span<std::size_t const> d_tree_segments, common::Span<int const> d_tree_group,
|
||||
common::Span<FeatureType const> d_tree_split_types,
|
||||
common::Span<std::uint32_t const> d_cat_tree_segments,
|
||||
common::Span<RegTree::CategoricalSplitMatrix::Segment const> d_cat_node_segments,
|
||||
common::Span<std::uint32_t const> d_categories, BitVector decision_bits, BitVector missing_bits,
|
||||
std::size_t tree_begin, std::size_t tree_end, std::size_t num_features, std::size_t num_rows,
|
||||
std::size_t entry_start, std::size_t num_nodes, bool use_shared, float missing) {
|
||||
auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (row_idx >= num_rows) {
|
||||
return;
|
||||
}
|
||||
SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
|
||||
|
||||
std::size_t tree_offset = 0;
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
TreeView d_tree{tree_begin, tree_idx, d_nodes,
|
||||
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
|
||||
d_cat_node_segments, d_categories};
|
||||
auto const tree_nodes = d_tree.d_tree.size();
|
||||
for (auto nid = 0; nid < tree_nodes; nid++) {
|
||||
auto const& node = d_tree.d_tree[nid];
|
||||
if (node.IsDeleted() || node.IsLeaf()) {
|
||||
continue;
|
||||
}
|
||||
auto const fvalue = loader.GetElement(row_idx, node.SplitIndex());
|
||||
auto const is_missing = common::CheckNAN(fvalue);
|
||||
auto const bit_index = row_idx * num_nodes + tree_offset + nid;
|
||||
if (is_missing) {
|
||||
missing_bits.Set(bit_index);
|
||||
} else {
|
||||
auto const decision = d_tree.HasCategoricalSplit()
|
||||
? GetDecision<true>(node, nid, fvalue, d_tree.cats)
|
||||
: GetDecision<false>(node, nid, fvalue, d_tree.cats);
|
||||
if (decision) {
|
||||
decision_bits.Set(bit_index);
|
||||
}
|
||||
}
|
||||
}
|
||||
tree_offset += tree_nodes;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
|
||||
BitVector const& decision_bits,
|
||||
BitVector const& missing_bits, std::size_t num_nodes,
|
||||
std::size_t tree_offset) {
|
||||
bst_node_t nidx = 0;
|
||||
RegTree::Node n = tree.d_tree[nidx];
|
||||
while (!n.IsLeaf()) {
|
||||
auto const bit_index = ridx * num_nodes + tree_offset + nidx;
|
||||
if (missing_bits.Check(bit_index)) {
|
||||
nidx = n.DefaultChild();
|
||||
} else {
|
||||
nidx = n.LeftChild() + !decision_bits.Check(bit_index);
|
||||
}
|
||||
n = tree.d_tree[nidx];
|
||||
}
|
||||
return tree.d_tree[nidx].LeafValue();
|
||||
}
|
||||
|
||||
__global__ void PredictByBitVectorKernel(
|
||||
common::Span<RegTree::Node const> d_nodes, common::Span<float> d_out_predictions,
|
||||
common::Span<std::size_t const> d_tree_segments, common::Span<int const> d_tree_group,
|
||||
common::Span<FeatureType const> d_tree_split_types,
|
||||
common::Span<std::uint32_t const> d_cat_tree_segments,
|
||||
common::Span<RegTree::CategoricalSplitMatrix::Segment const> d_cat_node_segments,
|
||||
common::Span<std::uint32_t const> d_categories, BitVector decision_bits, BitVector missing_bits,
|
||||
std::size_t tree_begin, std::size_t tree_end, std::size_t num_rows, std::size_t num_nodes,
|
||||
std::uint32_t num_group) {
|
||||
auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (row_idx >= num_rows) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::size_t tree_offset = 0;
|
||||
if (num_group == 1) {
|
||||
float sum = 0;
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
TreeView d_tree{tree_begin, tree_idx, d_nodes,
|
||||
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
|
||||
d_cat_node_segments, d_categories};
|
||||
sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
|
||||
tree_offset);
|
||||
tree_offset += d_tree.d_tree.size();
|
||||
}
|
||||
d_out_predictions[row_idx] += sum;
|
||||
} else {
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
auto const tree_group = d_tree_group[tree_idx];
|
||||
TreeView d_tree{tree_begin, tree_idx, d_nodes,
|
||||
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
|
||||
d_cat_node_segments, d_categories};
|
||||
bst_uint out_prediction_idx = row_idx * num_group + tree_group;
|
||||
d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
|
||||
row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
|
||||
tree_offset += d_tree.d_tree.size();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class ColumnSplitHelper {
|
||||
public:
|
||||
explicit ColumnSplitHelper(Context const* ctx) : ctx_{ctx} {}
|
||||
|
||||
void PredictBatch(DMatrix* dmat, HostDeviceVector<float>* out_preds,
|
||||
gbm::GBTreeModel const& model, DeviceModel const& d_model) const {
|
||||
CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
|
||||
PredictDMatrix(dmat, out_preds, d_model, model.learner_model_param->num_feature,
|
||||
model.learner_model_param->num_output_group);
|
||||
}
|
||||
|
||||
private:
|
||||
using BitType = BitVector::value_type;
|
||||
|
||||
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
|
||||
bst_feature_t num_features, std::uint32_t num_group) const {
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
dh::caching_device_vector<BitType> decision_storage{};
|
||||
dh::caching_device_vector<BitType> missing_storage{};
|
||||
|
||||
auto constexpr kBlockThreads = 128;
|
||||
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id);
|
||||
auto const shared_memory_bytes =
|
||||
SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
|
||||
auto const use_shared = shared_memory_bytes != 0;
|
||||
|
||||
auto const num_nodes = model.nodes.Size();
|
||||
std::size_t batch_offset = 0;
|
||||
for (auto const& batch : dmat->GetBatches<SparsePage>()) {
|
||||
auto const num_rows = batch.Size();
|
||||
ResizeBitVectors(&decision_storage, &missing_storage, num_rows * num_nodes);
|
||||
BitVector decision_bits{dh::ToSpan(decision_storage)};
|
||||
BitVector missing_bits{dh::ToSpan(missing_storage)};
|
||||
|
||||
batch.offset.SetDevice(ctx_->gpu_id);
|
||||
batch.data.SetDevice(ctx_->gpu_id);
|
||||
std::size_t entry_start = 0;
|
||||
SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
|
||||
|
||||
auto const grid = static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
|
||||
dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes, ctx_->CUDACtx()->Stream()} (
|
||||
MaskBitVectorKernel, data, model.nodes.ConstDeviceSpan(),
|
||||
model.tree_segments.ConstDeviceSpan(), model.tree_group.ConstDeviceSpan(),
|
||||
model.split_types.ConstDeviceSpan(), model.categories_tree_segments.ConstDeviceSpan(),
|
||||
model.categories_node_segments.ConstDeviceSpan(), model.categories.ConstDeviceSpan(),
|
||||
decision_bits, missing_bits, model.tree_beg_, model.tree_end_, num_features, num_rows,
|
||||
entry_start, num_nodes, use_shared, nan(""));
|
||||
|
||||
AllReduceBitVectors(&decision_storage, &missing_storage);
|
||||
|
||||
dh::LaunchKernel {grid, kBlockThreads, 0, ctx_->CUDACtx()->Stream()} (
|
||||
PredictByBitVectorKernel, model.nodes.ConstDeviceSpan(),
|
||||
out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
|
||||
model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
|
||||
model.categories_tree_segments.ConstDeviceSpan(),
|
||||
model.categories_node_segments.ConstDeviceSpan(), model.categories.ConstDeviceSpan(),
|
||||
decision_bits, missing_bits, model.tree_beg_, model.tree_end_, num_rows, num_nodes,
|
||||
num_group);
|
||||
|
||||
batch_offset += batch.Size() * num_group;
|
||||
}
|
||||
}
|
||||
|
||||
void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
|
||||
dh::caching_device_vector<BitType>* missing_storage) const {
|
||||
collective::AllReduce<collective::Operation::kBitwiseOR>(
|
||||
ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
|
||||
collective::AllReduce<collective::Operation::kBitwiseAND>(
|
||||
ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
|
||||
collective::Synchronize(ctx_->gpu_id);
|
||||
}
|
||||
|
||||
void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
|
||||
dh::caching_device_vector<BitType>* missing_storage,
|
||||
std::size_t total_bits) const {
|
||||
auto const size = BitVector::ComputeStorageSize(total_bits);
|
||||
if (decision_storage->size() < size) {
|
||||
decision_storage->resize(size);
|
||||
}
|
||||
thrust::fill(ctx_->CUDACtx()->CTP(), decision_storage->begin(), decision_storage->end(), 0);
|
||||
if (missing_storage->size() < size) {
|
||||
missing_storage->resize(size);
|
||||
}
|
||||
thrust::fill(ctx_->CUDACtx()->CTP(), missing_storage->begin(), missing_storage->end(), 0);
|
||||
}
|
||||
|
||||
Context const* ctx_;
|
||||
};
|
||||
} // anonymous namespace
|
||||
|
||||
class GPUPredictor : public xgboost::Predictor {
|
||||
@@ -891,11 +697,6 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id);
|
||||
|
||||
if (dmat->Info().IsColumnSplit()) {
|
||||
column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
|
||||
return;
|
||||
}
|
||||
|
||||
if (dmat->PageExists<SparsePage>()) {
|
||||
size_t batch_offset = 0;
|
||||
for (auto &batch : dmat->GetBatches<SparsePage>()) {
|
||||
@@ -919,8 +720,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
}
|
||||
|
||||
public:
|
||||
explicit GPUPredictor(Context const* ctx)
|
||||
: Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
|
||||
explicit GPUPredictor(Context const* ctx) : Predictor::Predictor{ctx} {}
|
||||
|
||||
~GPUPredictor() override {
|
||||
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
|
||||
@@ -990,7 +790,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
PredictionCacheEntry* out_preds, uint32_t tree_begin,
|
||||
unsigned tree_end) const override {
|
||||
auto proxy = dynamic_cast<data::DMatrixProxy*>(p_m.get());
|
||||
CHECK(proxy) << error::InplacePredictProxy();
|
||||
CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input.";
|
||||
auto x = proxy->Adapter();
|
||||
if (x.type() == typeid(std::shared_ptr<data::CupyAdapter>)) {
|
||||
this->DispatchedInplacePredict<data::CupyAdapter,
|
||||
@@ -1219,8 +1019,6 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
ColumnSplitHelper column_split_helper_;
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
|
||||
|
||||
@@ -7,18 +7,6 @@
|
||||
#include "xgboost/tree_model.h"
|
||||
|
||||
namespace xgboost::predictor {
|
||||
/** @brief Whether it should traverse to the left branch of a tree. */
|
||||
template <bool has_categorical>
|
||||
XGBOOST_DEVICE bool GetDecision(RegTree::Node const &node, bst_node_t nid, float fvalue,
|
||||
RegTree::CategoricalSplitMatrix const &cats) {
|
||||
if (has_categorical && common::IsCat(cats.split_type, nid)) {
|
||||
auto node_categories = cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
|
||||
return common::Decision(node_categories, fvalue);
|
||||
} else {
|
||||
return fvalue < node.SplitCond();
|
||||
}
|
||||
}
|
||||
|
||||
template <bool has_missing, bool has_categorical>
|
||||
inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bst_node_t nid,
|
||||
float fvalue, bool is_missing,
|
||||
@@ -26,7 +14,13 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
|
||||
if (has_missing && is_missing) {
|
||||
return node.DefaultChild();
|
||||
} else {
|
||||
return node.LeftChild() + !GetDecision<has_categorical>(node, nid, fvalue, cats);
|
||||
if (has_categorical && common::IsCat(cats.split_type, nid)) {
|
||||
auto node_categories =
|
||||
cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
|
||||
return common::Decision(node_categories, fvalue) ? node.LeftChild() : node.RightChild();
|
||||
} else {
|
||||
return node.LeftChild() + !(fvalue < node.SplitCond());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2019 XGBoost contributors
|
||||
*/
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/device_vector.h>
|
||||
@@ -140,20 +140,20 @@ void FeatureInteractionConstraintDevice::Reset() {
|
||||
__global__ void ClearBuffersKernel(
|
||||
LBitField64 result_buffer_output, LBitField64 result_buffer_input) {
|
||||
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < result_buffer_output.Capacity()) {
|
||||
if (tid < result_buffer_output.Size()) {
|
||||
result_buffer_output.Clear(tid);
|
||||
}
|
||||
if (tid < result_buffer_input.Capacity()) {
|
||||
if (tid < result_buffer_input.Size()) {
|
||||
result_buffer_input.Clear(tid);
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureInteractionConstraintDevice::ClearBuffers() {
|
||||
CHECK_EQ(output_buffer_bits_.Capacity(), input_buffer_bits_.Capacity());
|
||||
CHECK_LE(feature_buffer_.Capacity(), output_buffer_bits_.Capacity());
|
||||
CHECK_EQ(output_buffer_bits_.Size(), input_buffer_bits_.Size());
|
||||
CHECK_LE(feature_buffer_.Size(), output_buffer_bits_.Size());
|
||||
uint32_t constexpr kBlockThreads = 256;
|
||||
auto const n_grids = static_cast<uint32_t>(
|
||||
common::DivRoundUp(input_buffer_bits_.Capacity(), kBlockThreads));
|
||||
common::DivRoundUp(input_buffer_bits_.Size(), kBlockThreads));
|
||||
dh::LaunchKernel {n_grids, kBlockThreads} (
|
||||
ClearBuffersKernel,
|
||||
output_buffer_bits_, input_buffer_bits_);
|
||||
@@ -207,11 +207,11 @@ common::Span<bst_feature_t> FeatureInteractionConstraintDevice::Query(
|
||||
ClearBuffers();
|
||||
|
||||
LBitField64 node_constraints = s_node_constraints_[nid];
|
||||
CHECK_EQ(input_buffer_bits_.Capacity(), output_buffer_bits_.Capacity());
|
||||
CHECK_EQ(input_buffer_bits_.Size(), output_buffer_bits_.Size());
|
||||
|
||||
uint32_t constexpr kBlockThreads = 256;
|
||||
auto n_grids = static_cast<uint32_t>(
|
||||
common::DivRoundUp(output_buffer_bits_.Capacity(), kBlockThreads));
|
||||
common::DivRoundUp(output_buffer_bits_.Size(), kBlockThreads));
|
||||
dh::LaunchKernel {n_grids, kBlockThreads} (
|
||||
SetInputBufferKernel,
|
||||
feature_list, input_buffer_bits_);
|
||||
@@ -274,13 +274,13 @@ __global__ void InteractionConstraintSplitKernel(LBitField64 feature,
|
||||
LBitField64 left,
|
||||
LBitField64 right) {
|
||||
auto tid = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if (tid > node.Capacity()) {
|
||||
if (tid > node.Size()) {
|
||||
return;
|
||||
}
|
||||
// enable constraints from feature
|
||||
node |= feature;
|
||||
// clear the buffer after use
|
||||
if (tid < feature.Capacity()) {
|
||||
if (tid < feature.Size()) {
|
||||
feature.Clear(tid);
|
||||
}
|
||||
|
||||
@@ -323,7 +323,7 @@ void FeatureInteractionConstraintDevice::Split(
|
||||
s_sets_, s_sets_ptr_);
|
||||
|
||||
uint32_t constexpr kBlockThreads = 256;
|
||||
auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Capacity(), kBlockThreads));
|
||||
auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Size(), kBlockThreads));
|
||||
|
||||
dh::LaunchKernel {n_grids, kBlockThreads} (
|
||||
InteractionConstraintSplitKernel,
|
||||
|
||||
@@ -146,30 +146,27 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
|
||||
CombineGradientPair combine_;
|
||||
};
|
||||
|
||||
NoSampling::NoSampling(BatchParam batch_param) : batch_param_(std::move(batch_param)) {}
|
||||
NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
|
||||
|
||||
GradientBasedSample NoSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) {
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
|
||||
return {dmat->Info().num_row_, page, gpair};
|
||||
return {dmat->Info().num_row_, page_, gpair};
|
||||
}
|
||||
|
||||
ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
|
||||
: batch_param_{std::move(batch_param)} {}
|
||||
ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
|
||||
size_t n_rows, BatchParam batch_param)
|
||||
: batch_param_{std::move(batch_param)},
|
||||
page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
|
||||
n_rows)) {}
|
||||
|
||||
GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
|
||||
common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) {
|
||||
if (!page_concatenated_) {
|
||||
// Concatenate all the external memory ELLPACK pages into a single in-memory page.
|
||||
page_.reset(nullptr);
|
||||
size_t offset = 0;
|
||||
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
|
||||
auto page = batch.Impl();
|
||||
if (!page_) {
|
||||
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
|
||||
page->row_stride, dmat->Info().num_row_);
|
||||
}
|
||||
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
|
||||
offset += num_elements;
|
||||
}
|
||||
@@ -178,8 +175,8 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
|
||||
return {dmat->Info().num_row_, page_.get(), gpair};
|
||||
}
|
||||
|
||||
UniformSampling::UniformSampling(BatchParam batch_param, float subsample)
|
||||
: batch_param_{std::move(batch_param)}, subsample_(subsample) {}
|
||||
UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
|
||||
: page_(page), subsample_(subsample) {}
|
||||
|
||||
GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) {
|
||||
@@ -188,8 +185,7 @@ GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<Gra
|
||||
thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<std::size_t>(0),
|
||||
BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
|
||||
return {dmat->Info().num_row_, page, gpair};
|
||||
return {dmat->Info().num_row_, page_, gpair};
|
||||
}
|
||||
|
||||
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
|
||||
@@ -240,10 +236,12 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
|
||||
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
|
||||
}
|
||||
|
||||
GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batch_param,
|
||||
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam&,
|
||||
float subsample)
|
||||
: subsample_(subsample),
|
||||
batch_param_{std::move(batch_param)},
|
||||
: page_(page),
|
||||
subsample_(subsample),
|
||||
threshold_(n_rows + 1, 0.0f),
|
||||
grad_sum_(n_rows, 0.0f) {}
|
||||
|
||||
@@ -254,19 +252,18 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
|
||||
size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
|
||||
gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
|
||||
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
|
||||
|
||||
// Perform Poisson sampling in place.
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
|
||||
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
|
||||
RandomWeight(common::GlobalRandom()())));
|
||||
return {n_rows, page, gpair};
|
||||
return {n_rows, page_, gpair};
|
||||
}
|
||||
|
||||
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
|
||||
BatchParam batch_param,
|
||||
float subsample)
|
||||
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
|
||||
size_t n_rows,
|
||||
BatchParam batch_param,
|
||||
float subsample)
|
||||
: batch_param_(std::move(batch_param)),
|
||||
subsample_(subsample),
|
||||
threshold_(n_rows + 1, 0.0f),
|
||||
@@ -276,15 +273,16 @@ ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t
|
||||
GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* ctx,
|
||||
common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) {
|
||||
auto cuctx = ctx->CUDACtx();
|
||||
bst_row_t n_rows = dmat->Info().num_row_;
|
||||
size_t n_rows = dmat->Info().num_row_;
|
||||
size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
|
||||
gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
|
||||
|
||||
// Perform Poisson sampling in place.
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
|
||||
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<size_t>(0),
|
||||
dh::tbegin(gpair),
|
||||
PoissonSampling(dh::ToSpan(threshold_),
|
||||
threshold_index,
|
||||
RandomWeight(common::GlobalRandom()())));
|
||||
|
||||
// Count the sampled rows.
|
||||
@@ -292,15 +290,16 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
|
||||
|
||||
// Compact gradient pairs.
|
||||
gpair_.resize(sample_rows);
|
||||
thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
|
||||
thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
|
||||
|
||||
// Index the sample rows.
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
|
||||
IsNonZero());
|
||||
thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
|
||||
sample_row_index_.begin());
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
|
||||
sample_row_index_.begin(), ClearEmptyRows());
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
|
||||
thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
|
||||
sample_row_index_.begin());
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
|
||||
sample_row_index_.begin(),
|
||||
sample_row_index_.begin(),
|
||||
ClearEmptyRows());
|
||||
|
||||
auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
|
||||
auto first_page = (*batch_iterator.begin()).Impl();
|
||||
@@ -318,13 +317,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
|
||||
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
|
||||
}
|
||||
|
||||
GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
|
||||
const BatchParam& batch_param, float subsample,
|
||||
int sampling_method, bool is_external_memory) {
|
||||
// The ctx is kept here for future development of stream-based operations.
|
||||
GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
|
||||
size_t n_rows, const BatchParam& batch_param,
|
||||
float subsample, int sampling_method) {
|
||||
monitor_.Init("gradient_based_sampler");
|
||||
|
||||
bool is_sampling = subsample < 1.0;
|
||||
bool is_external_memory = page->n_rows != n_rows;
|
||||
|
||||
if (is_sampling) {
|
||||
switch (sampling_method) {
|
||||
@@ -332,24 +331,24 @@ GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows
|
||||
if (is_external_memory) {
|
||||
strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
|
||||
} else {
|
||||
strategy_.reset(new UniformSampling(batch_param, subsample));
|
||||
strategy_.reset(new UniformSampling(page, subsample));
|
||||
}
|
||||
break;
|
||||
case TrainParam::kGradientBased:
|
||||
if (is_external_memory) {
|
||||
strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
|
||||
strategy_.reset(
|
||||
new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
|
||||
} else {
|
||||
strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
|
||||
strategy_.reset(new GradientBasedSampling(page, n_rows, batch_param, subsample));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "unknown sampling method";
|
||||
default:LOG(FATAL) << "unknown sampling method";
|
||||
}
|
||||
} else {
|
||||
if (is_external_memory) {
|
||||
strategy_.reset(new ExternalMemoryNoSampling(batch_param));
|
||||
strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
|
||||
} else {
|
||||
strategy_.reset(new NoSampling(batch_param));
|
||||
strategy_.reset(new NoSampling(page));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -363,11 +362,11 @@ GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
|
||||
return sample;
|
||||
}
|
||||
|
||||
size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair> gpair,
|
||||
common::Span<float> threshold,
|
||||
common::Span<float> grad_sum,
|
||||
size_t sample_rows) {
|
||||
thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold), std::numeric_limits<float>::max());
|
||||
size_t GradientBasedSampler::CalculateThresholdIndex(
|
||||
common::Span<GradientPair> gpair, common::Span<float> threshold,
|
||||
common::Span<float> grad_sum, size_t sample_rows) {
|
||||
thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold),
|
||||
std::numeric_limits<float>::max());
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), dh::tbegin(threshold),
|
||||
CombineGradientPair());
|
||||
thrust::sort(dh::tbegin(threshold), dh::tend(threshold) - 1);
|
||||
@@ -380,5 +379,6 @@ size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair>
|
||||
thrust::min_element(dh::tbegin(grad_sum), dh::tend(grad_sum));
|
||||
return thrust::distance(dh::tbegin(grad_sum), min) + 1;
|
||||
}
|
||||
|
||||
}; // namespace tree
|
||||
}; // namespace xgboost
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2019 by XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <xgboost/base.h>
|
||||
@@ -32,36 +32,37 @@ class SamplingStrategy {
|
||||
/*! \brief No sampling in in-memory mode. */
|
||||
class NoSampling : public SamplingStrategy {
|
||||
public:
|
||||
explicit NoSampling(BatchParam batch_param);
|
||||
explicit NoSampling(EllpackPageImpl const* page);
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
BatchParam batch_param_;
|
||||
EllpackPageImpl const* page_;
|
||||
};
|
||||
|
||||
/*! \brief No sampling in external memory mode. */
|
||||
class ExternalMemoryNoSampling : public SamplingStrategy {
|
||||
public:
|
||||
explicit ExternalMemoryNoSampling(BatchParam batch_param);
|
||||
ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
|
||||
BatchParam batch_param);
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
BatchParam batch_param_;
|
||||
std::unique_ptr<EllpackPageImpl> page_{nullptr};
|
||||
std::unique_ptr<EllpackPageImpl> page_;
|
||||
bool page_concatenated_{false};
|
||||
};
|
||||
|
||||
/*! \brief Uniform sampling in in-memory mode. */
|
||||
class UniformSampling : public SamplingStrategy {
|
||||
public:
|
||||
UniformSampling(BatchParam batch_param, float subsample);
|
||||
UniformSampling(EllpackPageImpl const* page, float subsample);
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
BatchParam batch_param_;
|
||||
EllpackPageImpl const* page_;
|
||||
float subsample_;
|
||||
};
|
||||
|
||||
@@ -83,12 +84,13 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
|
||||
/*! \brief Gradient-based sampling in in-memory mode.. */
|
||||
class GradientBasedSampling : public SamplingStrategy {
|
||||
public:
|
||||
GradientBasedSampling(std::size_t n_rows, BatchParam batch_param, float subsample);
|
||||
GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param,
|
||||
float subsample);
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
BatchParam batch_param_;
|
||||
EllpackPageImpl const* page_;
|
||||
float subsample_;
|
||||
dh::caching_device_vector<float> threshold_;
|
||||
dh::caching_device_vector<float> grad_sum_;
|
||||
@@ -104,11 +106,11 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
|
||||
private:
|
||||
BatchParam batch_param_;
|
||||
float subsample_;
|
||||
dh::device_vector<float> threshold_;
|
||||
dh::device_vector<float> grad_sum_;
|
||||
dh::caching_device_vector<float> threshold_;
|
||||
dh::caching_device_vector<float> grad_sum_;
|
||||
std::unique_ptr<EllpackPageImpl> page_;
|
||||
dh::device_vector<GradientPair> gpair_;
|
||||
dh::device_vector<size_t> sample_row_index_;
|
||||
dh::caching_device_vector<size_t> sample_row_index_;
|
||||
};
|
||||
|
||||
/*! \brief Draw a sample of rows from a DMatrix.
|
||||
@@ -122,8 +124,8 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
|
||||
*/
|
||||
class GradientBasedSampler {
|
||||
public:
|
||||
GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
|
||||
float subsample, int sampling_method, bool is_external_memory);
|
||||
GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
|
||||
const BatchParam& batch_param, float subsample, int sampling_method);
|
||||
|
||||
/*! \brief Sample from a DMatrix based on the given gradient pairs. */
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);
|
||||
|
||||
@@ -213,7 +213,7 @@ std::vector<bst_cat_t> GetSplitCategories(RegTree const &tree, int32_t nidx) {
|
||||
auto split = common::KCatBitField{csr.categories.subspan(seg.beg, seg.size)};
|
||||
|
||||
std::vector<bst_cat_t> cats;
|
||||
for (size_t i = 0; i < split.Capacity(); ++i) {
|
||||
for (size_t i = 0; i < split.Size(); ++i) {
|
||||
if (split.Check(i)) {
|
||||
cats.push_back(static_cast<bst_cat_t>(i));
|
||||
}
|
||||
@@ -1004,7 +1004,7 @@ void RegTree::SaveCategoricalSplit(Json* p_out) const {
|
||||
auto segment = split_categories_segments_[i];
|
||||
auto node_categories = this->GetSplitCategories().subspan(segment.beg, segment.size);
|
||||
common::KCatBitField const cat_bits(node_categories);
|
||||
for (size_t i = 0; i < cat_bits.Capacity(); ++i) {
|
||||
for (size_t i = 0; i < cat_bits.Size(); ++i) {
|
||||
if (cat_bits.Check(i)) {
|
||||
categories.GetArray().emplace_back(i);
|
||||
}
|
||||
|
||||
@@ -176,7 +176,7 @@ struct GPUHistMakerDevice {
|
||||
Context const* ctx_;
|
||||
|
||||
public:
|
||||
EllpackPageImpl const* page{nullptr};
|
||||
EllpackPageImpl const* page;
|
||||
common::Span<FeatureType const> feature_types;
|
||||
BatchParam batch_param;
|
||||
|
||||
@@ -205,39 +205,41 @@ struct GPUHistMakerDevice {
|
||||
|
||||
std::unique_ptr<FeatureGroups> feature_groups;
|
||||
|
||||
GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
|
||||
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
|
||||
|
||||
GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
|
||||
common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
|
||||
TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
|
||||
BatchParam _batch_param)
|
||||
: evaluator_{_param, n_features, ctx->gpu_id},
|
||||
ctx_(ctx),
|
||||
page(_page),
|
||||
feature_types{_feature_types},
|
||||
param(std::move(_param)),
|
||||
column_sampler(column_sampler_seed),
|
||||
interaction_constraints(param, n_features),
|
||||
batch_param(std::move(_batch_param)) {
|
||||
sampler.reset(new GradientBasedSampler(ctx, _n_rows, batch_param, param.subsample,
|
||||
param.sampling_method, is_external_memory));
|
||||
sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
|
||||
param.sampling_method));
|
||||
if (!param.monotone_constraints.empty()) {
|
||||
// Copy assigning an empty vector causes an exception in MSVC debug builds
|
||||
monotone_constraints = param.monotone_constraints;
|
||||
}
|
||||
|
||||
// Init histogram
|
||||
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
|
||||
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
|
||||
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
|
||||
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
|
||||
sizeof(GradientSumT)));
|
||||
}
|
||||
|
||||
~GPUHistMakerDevice() = default;
|
||||
|
||||
void InitFeatureGroupsOnce() {
|
||||
if (!feature_groups) {
|
||||
CHECK(page);
|
||||
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
|
||||
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
|
||||
sizeof(GradientSumT)));
|
||||
}
|
||||
~GPUHistMakerDevice() { // NOLINT
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
}
|
||||
|
||||
// Reset values for each update iteration
|
||||
// Note that the column sampler must be passed by value because it is not
|
||||
// thread safe
|
||||
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
|
||||
auto const& info = dmat->Info();
|
||||
this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
|
||||
@@ -245,30 +247,26 @@ struct GPUHistMakerDevice {
|
||||
param.colsample_bytree);
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
|
||||
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
|
||||
ctx_->gpu_id);
|
||||
|
||||
this->interaction_constraints.Reset();
|
||||
|
||||
if (d_gpair.size() != dh_gpair->Size()) {
|
||||
d_gpair.resize(dh_gpair->Size());
|
||||
}
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
|
||||
dh_gpair->Size() * sizeof(GradientPair),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
|
||||
dh_gpair->Size() * sizeof(GradientPair), cudaMemcpyDeviceToDevice));
|
||||
auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
|
||||
page = sample.page;
|
||||
gpair = sample.gpair;
|
||||
|
||||
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
|
||||
|
||||
quantiser.reset(new GradientQuantiser(this->gpair));
|
||||
|
||||
row_partitioner.reset(); // Release the device memory first before reallocating
|
||||
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
|
||||
|
||||
// Init histogram
|
||||
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
|
||||
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
|
||||
hist.Reset();
|
||||
|
||||
this->InitFeatureGroupsOnce();
|
||||
}
|
||||
|
||||
GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
|
||||
@@ -810,11 +808,12 @@ class GPUHistMaker : public TreeUpdater {
|
||||
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
|
||||
|
||||
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
info_->feature_types.SetDevice(ctx_->gpu_id);
|
||||
maker.reset(new GPUHistMakerDevice<GradientSumT>(
|
||||
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
|
||||
*param, column_sampling_seed, info_->num_col_, batch_param));
|
||||
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
|
||||
column_sampling_seed, info_->num_col_, batch_param));
|
||||
|
||||
p_last_fmat_ = dmat;
|
||||
initialised_ = true;
|
||||
|
||||
@@ -24,7 +24,7 @@ set -x
|
||||
|
||||
CUDA_VERSION=11.8.0
|
||||
NCCL_VERSION=2.16.5-1
|
||||
RAPIDS_VERSION=23.08
|
||||
RAPIDS_VERSION=23.06
|
||||
SPARK_VERSION=3.4.0
|
||||
JDK_VERSION=8
|
||||
|
||||
|
||||
@@ -23,11 +23,7 @@ class LintersPaths:
|
||||
"tests/python/test_predict.py",
|
||||
"tests/python/test_quantile_dmatrix.py",
|
||||
"tests/python/test_tree_regularization.py",
|
||||
"tests/python/test_shap.py",
|
||||
"tests/python-gpu/test_gpu_data_iterator.py",
|
||||
"tests/python-gpu/test_gpu_prediction.py",
|
||||
"tests/python-gpu/load_pickle.py",
|
||||
"tests/python-gpu/test_gpu_pickling.py",
|
||||
"tests/test_distributed/test_with_spark/",
|
||||
"tests/test_distributed/test_gpu_with_spark/",
|
||||
# demo
|
||||
@@ -41,7 +37,6 @@ class LintersPaths:
|
||||
"demo/guide-python/quantile_regression.py",
|
||||
"demo/guide-python/multioutput_regression.py",
|
||||
"demo/guide-python/learning_to_rank.py",
|
||||
"demo/aft_survival/aft_survival_viz_demo.py",
|
||||
# CI
|
||||
"tests/ci_build/lint_python.py",
|
||||
"tests/ci_build/test_r_package.py",
|
||||
@@ -71,7 +66,6 @@ class LintersPaths:
|
||||
"tests/python/test_dt.py",
|
||||
"tests/python/test_data_iterator.py",
|
||||
"tests/python-gpu/test_gpu_data_iterator.py",
|
||||
"tests/python-gpu/load_pickle.py",
|
||||
"tests/test_distributed/test_with_spark/test_data.py",
|
||||
"tests/test_distributed/test_gpu_with_spark/test_data.py",
|
||||
"tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py",
|
||||
@@ -84,7 +78,6 @@ class LintersPaths:
|
||||
"demo/guide-python/quantile_regression.py",
|
||||
"demo/guide-python/multioutput_regression.py",
|
||||
"demo/guide-python/learning_to_rank.py",
|
||||
"demo/aft_survival/aft_survival_viz_demo.py",
|
||||
# CI
|
||||
"tests/ci_build/lint_python.py",
|
||||
"tests/ci_build/test_r_package.py",
|
||||
@@ -121,13 +114,7 @@ def run_black(rel_path: str, fix: bool) -> bool:
|
||||
@cd(PY_PACKAGE)
|
||||
def run_isort(rel_path: str, fix: bool) -> bool:
|
||||
# Isort gets confused when trying to find the config file, so specified explicitly.
|
||||
cmd = [
|
||||
"isort",
|
||||
"--settings-path",
|
||||
PY_PACKAGE,
|
||||
f"--src={PY_PACKAGE}",
|
||||
os.path.join(ROOT, rel_path),
|
||||
]
|
||||
cmd = ["isort", "--settings-path", PY_PACKAGE, os.path.join(ROOT, rel_path)]
|
||||
if not fix:
|
||||
cmd += ["--check"]
|
||||
|
||||
|
||||
@@ -5,18 +5,21 @@
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <bitset>
|
||||
#include <string> // for string
|
||||
|
||||
#include "../../../src/collective/communicator-inl.cuh"
|
||||
#include "../../../src/collective/nccl_device_communicator.cuh"
|
||||
#include "../helpers.h"
|
||||
#include "../../../src/collective/communicator-inl.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) {
|
||||
auto construct = []() { NcclDeviceCommunicator comm{-1, false}; };
|
||||
auto construct = []() { NcclDeviceCommunicator comm{-1, nullptr}; };
|
||||
EXPECT_THROW(construct(), dmlc::Error);
|
||||
}
|
||||
|
||||
TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
|
||||
auto construct = []() { NcclDeviceCommunicator comm{0, nullptr}; };
|
||||
EXPECT_THROW(construct(), dmlc::Error);
|
||||
}
|
||||
|
||||
@@ -28,69 +31,6 @@ TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
|
||||
ASSERT_TRUE(str.find("environment variables") != std::string::npos);
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
void VerifyAllReduceBitwiseAND() {
|
||||
auto const rank = collective::GetRank();
|
||||
std::bitset<64> original{};
|
||||
original[rank] = true;
|
||||
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
|
||||
collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1);
|
||||
collective::Synchronize(rank);
|
||||
EXPECT_EQ(buffer.HostVector()[0], 0ULL);
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseAND) {
|
||||
auto const n_gpus = common::AllVisibleGPUs();
|
||||
if (n_gpus <= 1) {
|
||||
GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseAND test with # GPUs = " << n_gpus;
|
||||
}
|
||||
RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseAND);
|
||||
}
|
||||
|
||||
namespace {
|
||||
void VerifyAllReduceBitwiseOR() {
|
||||
auto const world_size = collective::GetWorldSize();
|
||||
auto const rank = collective::GetRank();
|
||||
std::bitset<64> original{};
|
||||
original[rank] = true;
|
||||
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
|
||||
collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1);
|
||||
collective::Synchronize(rank);
|
||||
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseOR) {
|
||||
auto const n_gpus = common::AllVisibleGPUs();
|
||||
if (n_gpus <= 1) {
|
||||
GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseOR test with # GPUs = " << n_gpus;
|
||||
}
|
||||
RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseOR);
|
||||
}
|
||||
|
||||
namespace {
|
||||
void VerifyAllReduceBitwiseXOR() {
|
||||
auto const world_size = collective::GetWorldSize();
|
||||
auto const rank = collective::GetRank();
|
||||
std::bitset<64> original{~0ULL};
|
||||
original[rank] = false;
|
||||
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
|
||||
collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1);
|
||||
collective::Synchronize(rank);
|
||||
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseXOR) {
|
||||
auto const n_gpus = common::AllVisibleGPUs();
|
||||
if (n_gpus <= 1) {
|
||||
GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseXOR test with # GPUs = " << n_gpus;
|
||||
}
|
||||
RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseXOR);
|
||||
}
|
||||
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2019 XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include "../../../src/common/bitfield.h"
|
||||
@@ -14,7 +14,7 @@ TEST(BitField, Check) {
|
||||
static_cast<typename common::Span<LBitField64::value_type>::index_type>(
|
||||
storage.size())});
|
||||
size_t true_bit = 190;
|
||||
for (size_t i = true_bit + 1; i < bits.Capacity(); ++i) {
|
||||
for (size_t i = true_bit + 1; i < bits.Size(); ++i) {
|
||||
ASSERT_FALSE(bits.Check(i));
|
||||
}
|
||||
ASSERT_TRUE(bits.Check(true_bit));
|
||||
@@ -34,7 +34,7 @@ TEST(BitField, Check) {
|
||||
ASSERT_FALSE(bits.Check(i));
|
||||
}
|
||||
ASSERT_TRUE(bits.Check(true_bit));
|
||||
for (size_t i = true_bit + 1; i < bits.Capacity(); ++i) {
|
||||
for (size_t i = true_bit + 1; i < bits.Size(); ++i) {
|
||||
ASSERT_FALSE(bits.Check(i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2019 XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/copy.h>
|
||||
@@ -12,7 +12,7 @@ namespace xgboost {
|
||||
|
||||
__global__ void TestSetKernel(LBitField64 bits) {
|
||||
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid < bits.Capacity()) {
|
||||
if (tid < bits.Size()) {
|
||||
bits.Set(tid);
|
||||
}
|
||||
}
|
||||
@@ -36,16 +36,20 @@ TEST(BitField, GPUSet) {
|
||||
|
||||
std::vector<LBitField64::value_type> h_storage(storage.size());
|
||||
thrust::copy(storage.begin(), storage.end(), h_storage.begin());
|
||||
LBitField64 outputs{
|
||||
common::Span<LBitField64::value_type>{h_storage.data(), h_storage.data() + h_storage.size()}};
|
||||
|
||||
LBitField64 outputs {
|
||||
common::Span<LBitField64::value_type>{h_storage.data(),
|
||||
h_storage.data() + h_storage.size()}};
|
||||
for (size_t i = 0; i < kBits; ++i) {
|
||||
ASSERT_TRUE(outputs.Check(i));
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
template <bool is_and, typename Op>
|
||||
void TestGPULogic(Op op) {
|
||||
__global__ void TestOrKernel(LBitField64 lhs, LBitField64 rhs) {
|
||||
lhs |= rhs;
|
||||
}
|
||||
|
||||
TEST(BitField, GPUAnd) {
|
||||
uint32_t constexpr kBits = 128;
|
||||
dh::device_vector<LBitField64::value_type> lhs_storage(kBits);
|
||||
dh::device_vector<LBitField64::value_type> rhs_storage(kBits);
|
||||
@@ -53,32 +57,13 @@ void TestGPULogic(Op op) {
|
||||
auto rhs = LBitField64(dh::ToSpan(rhs_storage));
|
||||
thrust::fill(lhs_storage.begin(), lhs_storage.end(), 0UL);
|
||||
thrust::fill(rhs_storage.begin(), rhs_storage.end(), ~static_cast<LBitField64::value_type>(0UL));
|
||||
dh::LaunchN(kBits, [=] __device__(auto) mutable { op(lhs, rhs); });
|
||||
TestOrKernel<<<1, kBits>>>(lhs, rhs);
|
||||
|
||||
std::vector<LBitField64::value_type> h_storage(lhs_storage.size());
|
||||
thrust::copy(lhs_storage.begin(), lhs_storage.end(), h_storage.begin());
|
||||
LBitField64 outputs{{h_storage.data(), h_storage.data() + h_storage.size()}};
|
||||
if (is_and) {
|
||||
for (size_t i = 0; i < kBits; ++i) {
|
||||
ASSERT_FALSE(outputs.Check(i));
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < kBits; ++i) {
|
||||
ASSERT_TRUE(outputs.Check(i));
|
||||
}
|
||||
LBitField64 outputs {{h_storage.data(), h_storage.data() + h_storage.size()}};
|
||||
for (size_t i = 0; i < kBits; ++i) {
|
||||
ASSERT_TRUE(outputs.Check(i));
|
||||
}
|
||||
}
|
||||
|
||||
void TestGPUAnd() {
|
||||
TestGPULogic<true>([] XGBOOST_DEVICE(LBitField64 & lhs, LBitField64 const& rhs) { lhs &= rhs; });
|
||||
}
|
||||
|
||||
void TestGPUOr() {
|
||||
TestGPULogic<false>([] XGBOOST_DEVICE(LBitField64 & lhs, LBitField64 const& rhs) { lhs |= rhs; });
|
||||
}
|
||||
} // namespace
|
||||
|
||||
TEST(BitField, GPUAnd) { TestGPUAnd(); }
|
||||
|
||||
TEST(BitField, GPUOr) { TestGPUOr(); }
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost
|
||||
@@ -83,9 +83,7 @@ template <typename BinIdxType>
|
||||
void CheckColumWithMissingValue(const DenseColumnIter<BinIdxType, true>& col,
|
||||
const GHistIndexMatrix& gmat) {
|
||||
for (auto i = 0ull; i < col.Size(); i++) {
|
||||
if (col.IsMissing(i)) {
|
||||
continue;
|
||||
}
|
||||
if (col.IsMissing(i)) continue;
|
||||
EXPECT_EQ(gmat.index[gmat.row_ptr[i]], col.GetGlobalBinIdx(i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -143,14 +143,11 @@ TEST(HistUtil, DeviceSketchCategoricalFeatures) {
|
||||
|
||||
void TestMixedSketch() {
|
||||
size_t n_samples = 1000, n_features = 2, n_categories = 3;
|
||||
bst_bin_t n_bins = 64;
|
||||
|
||||
std::vector<float> data(n_samples * n_features);
|
||||
SimpleLCG gen;
|
||||
SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
|
||||
SimpleRealUniformDistribution<float> num_d{0.0f, 3.0f};
|
||||
for (size_t i = 0; i < n_samples * n_features; ++i) {
|
||||
// two features, row major. The first column is numeric and the second is categorical.
|
||||
if (i % 2 == 0) {
|
||||
data[i] = std::floor(cat_d(&gen));
|
||||
} else {
|
||||
@@ -162,75 +159,12 @@ void TestMixedSketch() {
|
||||
m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
|
||||
m->Info().feature_types.HostVector().push_back(FeatureType::kNumerical);
|
||||
|
||||
auto cuts = DeviceSketch(0, m.get(), n_bins);
|
||||
ASSERT_EQ(cuts.Values().size(), n_bins + n_categories);
|
||||
auto cuts = DeviceSketch(0, m.get(), 64);
|
||||
ASSERT_EQ(cuts.Values().size(), 64 + n_categories);
|
||||
}
|
||||
|
||||
TEST(HistUtil, DeviceSketchMixedFeatures) { TestMixedSketch(); }
|
||||
|
||||
TEST(HistUtil, RemoveDuplicatedCategories) {
|
||||
bst_row_t n_samples = 512;
|
||||
bst_feature_t n_features = 3;
|
||||
bst_cat_t n_categories = 5;
|
||||
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
SimpleLCG rng;
|
||||
SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
|
||||
|
||||
dh::device_vector<Entry> sorted_entries(n_samples * n_features);
|
||||
for (std::size_t i = 0; i < n_samples; ++i) {
|
||||
for (bst_feature_t j = 0; j < n_features; ++j) {
|
||||
float fvalue{0.0f};
|
||||
// The second column is categorical
|
||||
if (j == 1) {
|
||||
fvalue = std::floor(cat_d(&rng));
|
||||
} else {
|
||||
fvalue = i;
|
||||
}
|
||||
sorted_entries[i * n_features + j] = Entry{j, fvalue};
|
||||
}
|
||||
}
|
||||
|
||||
MetaInfo info;
|
||||
info.num_col_ = n_features;
|
||||
info.num_row_ = n_samples;
|
||||
info.feature_types.HostVector() = std::vector<FeatureType>{
|
||||
FeatureType::kNumerical, FeatureType::kCategorical, FeatureType::kNumerical};
|
||||
ASSERT_EQ(info.feature_types.Size(), n_features);
|
||||
|
||||
HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
|
||||
cuts_ptr.SetDevice(0);
|
||||
|
||||
dh::device_vector<float> weight(n_samples * n_features, 0);
|
||||
dh::Iota(dh::ToSpan(weight));
|
||||
|
||||
dh::caching_device_vector<bst_row_t> columns_ptr(4);
|
||||
for (std::size_t i = 0; i < columns_ptr.size(); ++i) {
|
||||
columns_ptr[i] = i * n_samples;
|
||||
}
|
||||
// sort into column major
|
||||
thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
|
||||
detail::EntryCompareOp());
|
||||
|
||||
detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries,
|
||||
&weight, &columns_ptr);
|
||||
|
||||
auto const& h_cptr = cuts_ptr.ConstHostVector();
|
||||
ASSERT_EQ(h_cptr.back(), n_samples * 2 + n_categories);
|
||||
// check numerical
|
||||
for (std::size_t i = 0; i < n_samples; ++i) {
|
||||
ASSERT_EQ(weight[i], i * 3);
|
||||
}
|
||||
auto beg = n_samples + n_categories;
|
||||
for (std::size_t i = 0; i < n_samples; ++i) {
|
||||
ASSERT_EQ(weight[i + beg], i * 3 + 2);
|
||||
}
|
||||
// check categorical
|
||||
beg = n_samples;
|
||||
for (std::size_t i = 0; i < n_categories; ++i) {
|
||||
// all from the second column
|
||||
ASSERT_EQ(static_cast<bst_feature_t>(weight[i + beg]) % n_features, 1);
|
||||
}
|
||||
TEST(HistUtil, DeviceSketchMixedFeatures) {
|
||||
TestMixedSketch();
|
||||
}
|
||||
|
||||
TEST(HistUtil, DeviceSketchMultipleColumns) {
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright (c) by XGBoost Contributors 2019
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
#include <fstream> // for ofstream
|
||||
#include <fstream>
|
||||
|
||||
#include "../../../src/common/io.h"
|
||||
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
||||
#include "../helpers.h"
|
||||
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
||||
|
||||
namespace xgboost::common {
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
TEST(MemoryFixSizeBuffer, Seek) {
|
||||
size_t constexpr kSize { 64 };
|
||||
std::vector<int32_t> memory( kSize );
|
||||
@@ -89,123 +89,5 @@ TEST(IO, LoadSequentialFile) {
|
||||
|
||||
ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error);
|
||||
}
|
||||
|
||||
TEST(IO, Resource) {
|
||||
{
|
||||
// test malloc basic
|
||||
std::size_t n = 128;
|
||||
std::shared_ptr<ResourceHandler> resource = std::make_shared<MallocResource>(n);
|
||||
ASSERT_EQ(resource->Size(), n);
|
||||
ASSERT_EQ(resource->Type(), ResourceHandler::kMalloc);
|
||||
}
|
||||
|
||||
// test malloc resize
|
||||
auto test_malloc_resize = [](bool force_malloc) {
|
||||
std::size_t n = 64;
|
||||
std::shared_ptr<ResourceHandler> resource = std::make_shared<MallocResource>(n);
|
||||
auto ptr = reinterpret_cast<std::uint8_t *>(resource->Data());
|
||||
std::iota(ptr, ptr + n, 0);
|
||||
|
||||
auto malloc_resource = std::dynamic_pointer_cast<MallocResource>(resource);
|
||||
ASSERT_TRUE(malloc_resource);
|
||||
if (force_malloc) {
|
||||
malloc_resource->Resize<true>(n * 2);
|
||||
} else {
|
||||
malloc_resource->Resize<false>(n * 2);
|
||||
}
|
||||
for (std::size_t i = 0; i < n; ++i) {
|
||||
ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], i) << force_malloc;
|
||||
}
|
||||
for (std::size_t i = n; i < 2 * n; ++i) {
|
||||
ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 0);
|
||||
}
|
||||
};
|
||||
test_malloc_resize(true);
|
||||
test_malloc_resize(false);
|
||||
|
||||
{
|
||||
// test mmap
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
auto path = tmpdir.path + "/testfile";
|
||||
|
||||
std::ofstream fout(path, std::ios::binary);
|
||||
double val{1.0};
|
||||
fout.write(reinterpret_cast<char const *>(&val), sizeof(val));
|
||||
fout << 1.0 << std::endl;
|
||||
fout.close();
|
||||
|
||||
auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
|
||||
ASSERT_EQ(resource->Size(), sizeof(double));
|
||||
ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
|
||||
ASSERT_EQ(resource->DataAs<double>()[0], val);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(IO, PrivateMmapStream) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
auto path = tempdir.path + "/testfile";
|
||||
|
||||
// The page size on Linux is usually set to 4096, while the allocation granularity on
|
||||
// the Windows machine where this test is writted is 65536. We span the test to cover
|
||||
// all of them.
|
||||
std::size_t n_batches{64};
|
||||
std::size_t multiplier{2048};
|
||||
|
||||
std::vector<std::vector<std::int32_t>> batches;
|
||||
std::vector<std::size_t> offset{0ul};
|
||||
|
||||
using T = std::int32_t;
|
||||
|
||||
{
|
||||
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
|
||||
for (std::size_t i = 0; i < n_batches; ++i) {
|
||||
std::size_t size = (i + 1) * multiplier;
|
||||
std::vector<T> data(size, 0);
|
||||
std::iota(data.begin(), data.end(), i * i);
|
||||
|
||||
fo->Write(static_cast<std::uint64_t>(data.size()));
|
||||
fo->Write(data.data(), data.size() * sizeof(T));
|
||||
|
||||
std::size_t bytes = sizeof(std::uint64_t) + data.size() * sizeof(T);
|
||||
offset.push_back(bytes);
|
||||
|
||||
batches.emplace_back(std::move(data));
|
||||
}
|
||||
}
|
||||
|
||||
// Turn size info offset
|
||||
std::partial_sum(offset.begin(), offset.end(), offset.begin());
|
||||
|
||||
// Test read
|
||||
for (std::size_t i = 0; i < n_batches; ++i) {
|
||||
std::size_t off = offset[i];
|
||||
std::size_t n = offset.at(i + 1) - offset[i];
|
||||
auto fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
|
||||
std::vector<T> data;
|
||||
|
||||
std::uint64_t size{0};
|
||||
ASSERT_TRUE(fi->Read(&size));
|
||||
ASSERT_EQ(fi->Tell(), sizeof(size));
|
||||
data.resize(size);
|
||||
|
||||
ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), size * sizeof(T));
|
||||
ASSERT_EQ(data, batches[i]);
|
||||
}
|
||||
|
||||
// Test consume
|
||||
for (std::size_t i = 0; i < n_batches; ++i) {
|
||||
std::size_t off = offset[i];
|
||||
std::size_t n = offset.at(i + 1) - offset[i];
|
||||
std::unique_ptr<AlignedResourceReadStream> fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
|
||||
std::vector<T> data;
|
||||
|
||||
std::uint64_t size{0};
|
||||
ASSERT_TRUE(fi->Consume(&size));
|
||||
ASSERT_EQ(fi->Tell(), sizeof(size));
|
||||
data.resize(size);
|
||||
|
||||
ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), sizeof(T) * size);
|
||||
ASSERT_EQ(data, batches[i]);
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -41,6 +41,7 @@ std::string GetModelStr() {
|
||||
"num_class": "0",
|
||||
"num_feature": "10",
|
||||
"objective": "reg:linear",
|
||||
"predictor": "gpu_predictor",
|
||||
"tree_method": "gpu_hist",
|
||||
"updater": "grow_gpu_hist"
|
||||
},
|
||||
|
||||
@@ -1,108 +0,0 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
#include <memory> // for make_shared, make_unique
|
||||
#include <numeric> // for iota
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../../src/common/ref_resource_view.h"
|
||||
#include "dmlc/filesystem.h" // for TemporaryDirectory
|
||||
|
||||
namespace xgboost::common {
|
||||
TEST(RefResourceView, Basic) {
|
||||
std::size_t n_bytes = 1024;
|
||||
auto mem = std::make_shared<MallocResource>(n_bytes);
|
||||
{
|
||||
RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};
|
||||
|
||||
RefResourceView kview{reinterpret_cast<float const*>(mem->Data()), mem->Size() / sizeof(float),
|
||||
mem};
|
||||
ASSERT_EQ(mem.use_count(), 3);
|
||||
ASSERT_EQ(view.size(), n_bytes / sizeof(1024));
|
||||
ASSERT_EQ(kview.size(), n_bytes / sizeof(1024));
|
||||
}
|
||||
{
|
||||
RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem,
|
||||
1.5f};
|
||||
for (auto v : view) {
|
||||
ASSERT_EQ(v, 1.5f);
|
||||
}
|
||||
std::iota(view.begin(), view.end(), 0.0f);
|
||||
ASSERT_EQ(view.front(), 0.0f);
|
||||
ASSERT_EQ(view.back(), static_cast<float>(view.size() - 1));
|
||||
|
||||
view.front() = 1.0f;
|
||||
view.back() = 2.0f;
|
||||
ASSERT_EQ(view.front(), 1.0f);
|
||||
ASSERT_EQ(view.back(), 2.0f);
|
||||
}
|
||||
ASSERT_EQ(mem.use_count(), 1);
|
||||
}
|
||||
|
||||
TEST(RefResourceView, IO) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
auto path = tmpdir.path + "/testfile";
|
||||
auto data = MakeFixedVecWithMalloc(123, std::size_t{1});
|
||||
|
||||
{
|
||||
auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
|
||||
ASSERT_EQ(fo->Write(data.data(), data.size_bytes()), data.size_bytes());
|
||||
}
|
||||
{
|
||||
auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
|
||||
ASSERT_EQ(WriteVec(fo.get(), data),
|
||||
data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
|
||||
}
|
||||
{
|
||||
auto fi = std::make_unique<PrivateMmapConstStream>(
|
||||
path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
|
||||
auto read = MakeFixedVecWithMalloc(123, std::size_t{1});
|
||||
ASSERT_TRUE(ReadVec(fi.get(), &read));
|
||||
for (auto v : read) {
|
||||
ASSERT_EQ(v, 1ul);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(RefResourceView, IOAligned) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
auto path = tmpdir.path + "/testfile";
|
||||
auto data = MakeFixedVecWithMalloc(123, 1.0f);
|
||||
|
||||
{
|
||||
auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
|
||||
// + sizeof(float) for alignment
|
||||
ASSERT_EQ(WriteVec(fo.get(), data),
|
||||
data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type) + sizeof(float));
|
||||
}
|
||||
{
|
||||
auto fi = std::make_unique<PrivateMmapConstStream>(
|
||||
path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
|
||||
// wrong type, float vs. double
|
||||
auto read = MakeFixedVecWithMalloc(123, 2.0);
|
||||
ASSERT_FALSE(ReadVec(fi.get(), &read));
|
||||
}
|
||||
{
|
||||
auto fi = std::make_unique<PrivateMmapConstStream>(
|
||||
path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
|
||||
auto read = MakeFixedVecWithMalloc(123, 2.0f);
|
||||
ASSERT_TRUE(ReadVec(fi.get(), &read));
|
||||
for (auto v : read) {
|
||||
ASSERT_EQ(v, 1ul);
|
||||
}
|
||||
}
|
||||
{
|
||||
// Test std::vector
|
||||
std::vector<float> data(123);
|
||||
std::iota(data.begin(), data.end(), 0.0f);
|
||||
auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
|
||||
// + sizeof(float) for alignment
|
||||
ASSERT_EQ(WriteVec(fo.get(), data), data.size() * sizeof(float) +
|
||||
sizeof(RefResourceView<std::size_t>::size_type) +
|
||||
sizeof(float));
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
@@ -4,14 +4,14 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include "../../../src/common/io.h" // for PrivateMmapConstStream, AlignedResourceReadStream...
|
||||
#include "../../../src/data/ellpack_page.cuh"
|
||||
#include "../../../src/data/sparse_page_source.h"
|
||||
#include "../../../src/tree/param.h" // TrainParam
|
||||
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
TEST(EllpackPageRawFormat, IO) {
|
||||
Context ctx{MakeCUDACtx(0)};
|
||||
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
|
||||
@@ -22,17 +22,15 @@ TEST(EllpackPageRawFormat, IO) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string path = tmpdir.path + "/ellpack.page";
|
||||
|
||||
std::size_t n_bytes{0};
|
||||
{
|
||||
auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
|
||||
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
|
||||
for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
|
||||
n_bytes += format->Write(ellpack, fo.get());
|
||||
format->Write(ellpack, fo.get());
|
||||
}
|
||||
}
|
||||
|
||||
EllpackPage page;
|
||||
std::unique_ptr<common::AlignedResourceReadStream> fi{
|
||||
std::make_unique<common::PrivateMmapConstStream>(path.c_str(), 0, n_bytes)};
|
||||
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
|
||||
format->Read(&page, fi.get());
|
||||
|
||||
for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
|
||||
@@ -46,4 +44,5 @@ TEST(EllpackPageRawFormat, IO) {
|
||||
ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector());
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -26,32 +26,28 @@
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/host_device_vector.h" // for HostDeviceVector
|
||||
|
||||
namespace xgboost::data {
|
||||
TEST(GradientIndex, ExternalMemoryBaseRowID) {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
TEST(GradientIndex, ExternalMemory) {
|
||||
Context ctx;
|
||||
auto p_fmat = RandomDataGenerator{4096, 256, 0.5}
|
||||
.Device(ctx.gpu_id)
|
||||
.Batches(8)
|
||||
.GenerateSparsePageDMatrix("cache", true);
|
||||
|
||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
|
||||
std::vector<size_t> base_rowids;
|
||||
std::vector<float> hessian(p_fmat->Info().num_row_, 1);
|
||||
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
|
||||
std::vector<float> hessian(dmat->Info().num_row_, 1);
|
||||
for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
|
||||
base_rowids.push_back(page.base_rowid);
|
||||
}
|
||||
|
||||
std::size_t i = 0;
|
||||
for (auto const &page : p_fmat->GetBatches<SparsePage>()) {
|
||||
size_t i = 0;
|
||||
for (auto const &page : dmat->GetBatches<SparsePage>()) {
|
||||
ASSERT_EQ(base_rowids[i], page.base_rowid);
|
||||
++i;
|
||||
}
|
||||
|
||||
base_rowids.clear();
|
||||
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
|
||||
for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
|
||||
base_rowids.push_back(page.base_rowid);
|
||||
}
|
||||
i = 0;
|
||||
for (auto const &page : p_fmat->GetBatches<SparsePage>()) {
|
||||
for (auto const &page : dmat->GetBatches<SparsePage>()) {
|
||||
ASSERT_EQ(base_rowids[i], page.base_rowid);
|
||||
++i;
|
||||
}
|
||||
@@ -175,7 +171,7 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
|
||||
gpu_ctx.gpu_id = 0;
|
||||
for (auto const &page : Xy->GetBatches<EllpackPage>(
|
||||
&gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
|
||||
from_ellpack = std::make_unique<GHistIndexMatrix>(&ctx, Xy->Info(), page, p);
|
||||
from_ellpack.reset(new GHistIndexMatrix{&ctx, Xy->Info(), page, p});
|
||||
}
|
||||
|
||||
for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(&ctx, p)) {
|
||||
@@ -203,15 +199,13 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
|
||||
|
||||
std::string from_sparse_buf;
|
||||
{
|
||||
common::AlignedMemWriteStream fo{&from_sparse_buf};
|
||||
auto n_bytes = columns_from_sparse.Write(&fo);
|
||||
ASSERT_EQ(fo.Tell(), n_bytes);
|
||||
common::MemoryBufferStream fo{&from_sparse_buf};
|
||||
columns_from_sparse.Write(&fo);
|
||||
}
|
||||
std::string from_ellpack_buf;
|
||||
{
|
||||
common::AlignedMemWriteStream fo{&from_ellpack_buf};
|
||||
auto n_bytes = columns_from_sparse.Write(&fo);
|
||||
ASSERT_EQ(fo.Tell(), n_bytes);
|
||||
common::MemoryBufferStream fo{&from_ellpack_buf};
|
||||
columns_from_sparse.Write(&fo);
|
||||
}
|
||||
ASSERT_EQ(from_sparse_buf, from_ellpack_buf);
|
||||
}
|
||||
@@ -235,4 +229,5 @@ INSTANTIATE_TEST_SUITE_P(GHistIndexMatrix, GHistIndexMatrixTest,
|
||||
std::make_tuple(.6f, .4))); // dense columns
|
||||
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user