Export Python Interface for external memory. (#7070)
* Add Python iterator interface. * Add tests. * Add demo. * Add documents. * Handle empty dataset.
This commit is contained in:
@@ -1,14 +1,17 @@
|
||||
cmake_minimum_required(VERSION 3.13)
|
||||
project(api-demo LANGUAGES C VERSION 0.0.1)
|
||||
find_package(xgboost REQUIRED)
|
||||
project(xgboost-c-examples)
|
||||
|
||||
# xgboost is built as static libraries, all cxx dependencies need to be linked into the
|
||||
# executable.
|
||||
if (XGBOOST_BUILD_STATIC_LIB)
|
||||
enable_language(CXX)
|
||||
# find again for those cxx libraries.
|
||||
find_package(xgboost REQUIRED)
|
||||
endif(XGBOOST_BUILD_STATIC_LIB)
|
||||
add_subdirectory(basic)
|
||||
add_subdirectory(external-memory)
|
||||
|
||||
add_executable(api-demo c-api-demo.c)
|
||||
target_link_libraries(api-demo PRIVATE xgboost::xgboost)
|
||||
enable_testing()
|
||||
add_test(
|
||||
NAME test_xgboost_demo_c_basic
|
||||
COMMAND api-demo
|
||||
WORKING_DIRECTORY ${xgboost-c-examples_BINARY_DIR}
|
||||
)
|
||||
add_test(
|
||||
NAME test_xgboost_demo_c_external_memory
|
||||
COMMAND external-memory-demo
|
||||
WORKING_DIRECTORY ${xgboost-c-examples_BINARY_DIR}
|
||||
)
|
||||
|
||||
13
demo/c-api/basic/CMakeLists.txt
Normal file
13
demo/c-api/basic/CMakeLists.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
project(api-demo LANGUAGES C VERSION 0.0.1)
|
||||
find_package(xgboost REQUIRED)
|
||||
|
||||
# xgboost is built as static libraries, all cxx dependencies need to be linked into the
|
||||
# executable.
|
||||
if (XGBOOST_BUILD_STATIC_LIB)
|
||||
enable_language(CXX)
|
||||
# find again for those cxx libraries.
|
||||
find_package(xgboost REQUIRED)
|
||||
endif(XGBOOST_BUILD_STATIC_LIB)
|
||||
|
||||
add_executable(api-demo c-api-demo.c)
|
||||
target_link_libraries(api-demo PRIVATE xgboost::xgboost)
|
||||
@@ -24,8 +24,8 @@ int main(int argc, char** argv) {
|
||||
|
||||
// load the data
|
||||
DMatrixHandle dtrain, dtest;
|
||||
safe_xgboost(XGDMatrixCreateFromFile("../data/agaricus.txt.train", silent, &dtrain));
|
||||
safe_xgboost(XGDMatrixCreateFromFile("../data/agaricus.txt.test", silent, &dtest));
|
||||
safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train", silent, &dtrain));
|
||||
safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test", silent, &dtest));
|
||||
|
||||
// create the booster
|
||||
BoosterHandle booster;
|
||||
7
demo/c-api/external-memory/CMakeLists.txt
Normal file
7
demo/c-api/external-memory/CMakeLists.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.13)
|
||||
project(external-memory-demo LANGUAGES C VERSION 0.0.1)
|
||||
|
||||
find_package(xgboost REQUIRED)
|
||||
|
||||
add_executable(external-memory-demo external_memory.c)
|
||||
target_link_libraries(external-memory-demo PRIVATE xgboost::xgboost)
|
||||
16
demo/c-api/external-memory/README.md
Normal file
16
demo/c-api/external-memory/README.md
Normal file
@@ -0,0 +1,16 @@
|
||||
Defining a Custom Data Iterator to Load Data from External Memory
|
||||
=================================================================
|
||||
|
||||
A simple demo for using a custom data iterator with XGBoost. The feature is still
|
||||
**experimental** and not ready for production use. If you are not familiar with C API,
|
||||
please read its introduction in our tutorials and visit the basic demo first.
|
||||
|
||||
Defining Data Iterator
|
||||
----------------------
|
||||
|
||||
In the example, we define a custom data iterator with 2 methods: `reset` and `next`. The
|
||||
`next` method passes data into XGBoost and tells XGBoost whether the iterator has reached
|
||||
its end, and the `reset` method resets iterations. One important detail when using the C
|
||||
API for a data iterator is that the data passed to XGBoost in the `next` method must be
|
||||
kept in memory until the next iteration or until `reset` is called. The external memory
|
||||
DMatrix is not limited to training; it is also valid for other features like prediction.
|
||||
179
demo/c-api/external-memory/external_memory.c
Normal file
179
demo/c-api/external-memory/external_memory.c
Normal file
@@ -0,0 +1,179 @@
|
||||
/*!
|
||||
* Copyright 2021 XGBoost contributors
|
||||
*
|
||||
* \brief A simple example of using xgboost data callback API.
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <xgboost/c_api.h>
|
||||
|
||||
#define safe_xgboost(err) \
|
||||
if ((err) != 0) { \
|
||||
fprintf(stderr, "%s:%d: error in %s: %s\n", __FILE__, __LINE__, #err, \
|
||||
XGBGetLastError()); \
|
||||
exit(1); \
|
||||
}
|
||||
|
||||
#define N_BATCHS 32
|
||||
#define BATCH_LEN 512
|
||||
|
||||
/* Shorthands. */
|
||||
typedef DMatrixHandle DMatrix;
|
||||
typedef BoosterHandle Booster;
|
||||
|
||||
typedef struct _DataIter {
|
||||
/* Data of each batch. */
|
||||
float **data;
|
||||
/* Labels of each batch */
|
||||
float **labels;
|
||||
/* Length of each batch. */
|
||||
size_t *lengths;
|
||||
/* Total number of batches. */
|
||||
size_t n;
|
||||
/* Current iteration. */
|
||||
size_t cur_it;
|
||||
|
||||
/* Private fields */
|
||||
DMatrix _proxy;
|
||||
char _array[128];
|
||||
} DataIter;
|
||||
|
||||
/* Abort the demo when an allocation returned NULL.
 *
 * `ptr` is the pointer that was just assigned from malloc(); if it is NULL a message
 * with the source location is printed and the process exits with status 1.
 *
 * Wrapped in do { ... } while (0) so that the macro behaves as a single statement and
 * is safe inside an unbraced if/else. Call sites must end the invocation with a
 * semicolon. */
#define safe_malloc(ptr)                                                      \
  do {                                                                        \
    if ((ptr) == NULL) {                                                      \
      fprintf(stderr, "%s:%d: Failed to allocate memory.\n", __FILE__,        \
              __LINE__);                                                      \
      exit(1);                                                                \
    }                                                                         \
  } while (0)
|
||||
|
||||
/**
|
||||
* Initialize with random data for demo. In practice the data should be loaded
|
||||
* from external memory. We just demonstrate how to use the iterator in
|
||||
* XGBoost.
|
||||
*
|
||||
* \param batch_size Number of elements for each batch. The demo here is only using 1
|
||||
* column.
|
||||
* \param n_batches Number of batches.
|
||||
*/
|
||||
void DataIterator_Init(DataIter *self, size_t batch_size, size_t n_batches) {
|
||||
self->n = n_batches;
|
||||
|
||||
self->lengths = (size_t *)malloc(self->n * sizeof(size_t));
|
||||
safe_malloc(self->lengths);
|
||||
for (size_t i = 0; i < self->n; ++i) {
|
||||
self->lengths[i] = batch_size;
|
||||
}
|
||||
|
||||
self->data = (float **)malloc(self->n * sizeof(float *));
|
||||
safe_malloc(self->data);
|
||||
self->labels = (float **)malloc(self->n * sizeof(float *));
|
||||
safe_malloc(self->labels);
|
||||
|
||||
/* Generate some random data. */
|
||||
for (size_t i = 0; i < self->n; ++i) {
|
||||
self->data[i] = (float *)malloc(self->lengths[i] * sizeof(float));
|
||||
safe_malloc(self->data[i]);
|
||||
for (size_t j = 0; j < self->lengths[i]; ++j) {
|
||||
float x = (float)rand() / (float)(RAND_MAX);
|
||||
self->data[i][j] = x;
|
||||
}
|
||||
|
||||
self->labels[i] = (float *)malloc(self->lengths[i] * sizeof(float));
|
||||
safe_malloc(self->labels[i]);
|
||||
for (size_t j = 0; j < self->lengths[i]; ++j) {
|
||||
float y = (float)rand() / (float)(RAND_MAX);
|
||||
self->labels[i][j] = y;
|
||||
}
|
||||
}
|
||||
|
||||
self->cur_it = 0;
|
||||
safe_xgboost(XGProxyDMatrixCreate(&self->_proxy));
|
||||
}
|
||||
|
||||
/**
 * Release every resource owned by the iterator: the per-batch feature and label
 * buffers, the arrays that hold them, the batch length array and the proxy DMatrix.
 *
 * \param self Iterator previously set up by DataIterator_Init.
 */
void DataIterator_Free(DataIter *self) {
  for (size_t i = 0; i < self->n; ++i) {
    free(self->data[i]);
    free(self->labels[i]);
  }
  free(self->data);
  /* The array of label pointers is a separate allocation and must be freed too. */
  free(self->labels);
  free(self->lengths);
  safe_xgboost(XGDMatrixFree(self->_proxy));
}
|
||||
|
||||
/**
 * Callback invoked by XGBoost to fetch the next batch.
 *
 * \param handle The DataIter registered with XGDMatrixCreateFromCallback.
 *
 * \return 0 when all batches have been consumed (the iterator is rewound), 1 after
 *         a batch has been passed to the proxy DMatrix.
 */
int DataIterator_Next(DataIterHandle handle) {
  DataIter *self = (DataIter *)(handle);
  if (self->cur_it == self->n) {
    self->cur_it = 0;
    return 0; /* At end */
  }

  /* A JSON string encoding array interface (standard from numpy). The address of the
   * batch and its length are size_t values, hence the %zu conversions. snprintf bounds
   * the write to the buffer and always null-terminates it. */
  int written = snprintf(self->_array, sizeof(self->_array),
                         "{\"data\": [%zu, false], \"shape\":[%zu, 1], \"typestr\": "
                         "\"<f4\", \"version\": 3}",
                         (size_t)self->data[self->cur_it],
                         self->lengths[self->cur_it]);
  if (written < 0 || (size_t)written >= sizeof(self->_array)) {
    fprintf(stderr, "%s:%d: Failed to encode the array interface.\n", __FILE__,
            __LINE__);
    exit(1);
  }

  safe_xgboost(XGProxyDMatrixSetDataDense(self->_proxy, self->_array));
  /* The data passed in the iterator must remain valid (not being freed until the next
   * iteration or reset) */
  /* NOTE(review): the trailing 1 is the data type code, presumably float32 - confirm
   * against c_api.h. */
  safe_xgboost(XGDMatrixSetDenseInfo(self->_proxy, "label",
                                     self->labels[self->cur_it],
                                     self->lengths[self->cur_it], 1));
  self->cur_it++;
  return 1; /* Continue. */
}
|
||||
|
||||
/**
 * Callback invoked by XGBoost to restart iteration from the first batch.
 *
 * \param handle The DataIter registered with XGDMatrixCreateFromCallback.
 */
void DataIterator_Reset(DataIterHandle handle) {
  DataIter *iter = (DataIter *)handle;
  iter->cur_it = 0;
}
|
||||
|
||||
/**
|
||||
* Train a regression model and save it into JSON model file.
|
||||
*/
|
||||
void TrainModel(DMatrix Xy) {
|
||||
/* Create booster for training. */
|
||||
Booster booster;
|
||||
DMatrix cache[] = {Xy};
|
||||
safe_xgboost(XGBoosterCreate(cache, 1, &booster));
|
||||
/* Use approx for external memory training. */
|
||||
safe_xgboost(XGBoosterSetParam(booster, "tree_method", "approx"));
|
||||
safe_xgboost(XGBoosterSetParam(booster, "objective", "reg:squarederror"));
|
||||
|
||||
/* Start training. */
|
||||
const char *validation_names[1] = {"train"};
|
||||
const char *validation_result = NULL;
|
||||
size_t n_rounds = 10;
|
||||
for (size_t i = 0; i < n_rounds; ++i) {
|
||||
safe_xgboost(XGBoosterUpdateOneIter(booster, i, Xy));
|
||||
safe_xgboost(XGBoosterEvalOneIter(booster, i, cache, validation_names, 1,
|
||||
&validation_result));
|
||||
printf("%s\n", validation_result);
|
||||
}
|
||||
|
||||
/* Save the model to a JSON file. */
|
||||
safe_xgboost(XGBoosterSaveModel(booster, "model.json"));
|
||||
|
||||
safe_xgboost(XGBoosterFree(booster));
|
||||
}
|
||||
|
||||
int main() {
|
||||
DataIter iter;
|
||||
DataIterator_Init(&iter, BATCH_LEN, N_BATCHS);
|
||||
|
||||
/* Create DMatrix from iterator. During training, some cache files with the
|
||||
* prefix "cache-" will be generated in current directory */
|
||||
char config[] = "{\"missing\": NaN, \"cache_prefix\": \"cache\"}";
|
||||
DMatrix Xy;
|
||||
safe_xgboost(XGDMatrixCreateFromCallback(
|
||||
&iter, iter._proxy, DataIterator_Reset, DataIterator_Next, config, &Xy));
|
||||
|
||||
TrainModel(Xy);
|
||||
|
||||
safe_xgboost(XGDMatrixFree(Xy));
|
||||
|
||||
DataIterator_Free(&iter);
|
||||
return 0;
|
||||
}
|
||||
@@ -1,22 +1,92 @@
|
||||
"""Experimental support for external memory. This is similar to the one in
|
||||
`quantile_data_iterator.py`, but for external memory instead of Quantile DMatrix. The
|
||||
feature is not ready for production use yet.
|
||||
|
||||
.. versionadded:: 1.5.0
|
||||
|
||||
"""
|
||||
import os
|
||||
import xgboost as xgb
|
||||
import xgboost
|
||||
from typing import Callable, List, Tuple
|
||||
import tempfile
|
||||
import numpy as np
|
||||
|
||||
### simple example for using external memory version
|
||||
|
||||
# this is the only difference, add a # followed by a cache prefix name
|
||||
# several cache file with the prefix will be generated
|
||||
# currently only support convert from libsvm file
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train#dtrain.cache'))
|
||||
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test#dtest.cache'))
|
||||
def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int
) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """Generate random batches.

    Returns a pair ``(X, y)`` of lists with ``n_batches`` entries each. Every ``X``
    entry has shape ``(n_samples_per_batch, n_features)`` and every ``y`` entry has
    shape ``(n_samples_per_batch,)``. A fixed seed makes the output reproducible.

    """
    rng = np.random.RandomState(1994)
    features: List[np.ndarray] = []
    labels: List[np.ndarray] = []
    for _ in range(n_batches):
        # Draw the features before the labels so the random stream is consumed in a
        # fixed order.
        features.append(rng.randn(n_samples_per_batch, n_features))
        labels.append(rng.randn(n_samples_per_batch))
    return features, labels
|
||||
|
||||
# specify validations set to watch performance
|
||||
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'}
|
||||
|
||||
# performance notice: set nthread to be the number of your real cpu
|
||||
# some cpu offer two threads per core, for example, a 4 core cpu with 8 threads, in such case set nthread=4
|
||||
#param['nthread']=num_real_cpu
|
||||
class Iterator(xgboost.DataIter):
|
||||
"""A custom iterator for loading files in batches."""
|
||||
def __init__(self, file_paths: List[Tuple[str, str]]):
    """Remember the ``(X_path, y_path)`` pairs to load and start at the first one."""
    self._file_paths = file_paths
    self._it = 0
    # XGBoost will generate some cache files under current directory with the prefix
    # "cache"
    cache_prefix = os.path.join(".", "cache")
    super().__init__(cache_prefix=cache_prefix)
|
||||
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
num_round = 2
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
def load_file(self) -> Tuple[np.ndarray, np.ndarray]:
    """Load the feature and label text files of the current batch.

    The two arrays must have the same number of rows.

    """
    feature_path, label_path = self._file_paths[self._it]
    features, labels = np.loadtxt(feature_path), np.loadtxt(label_path)
    assert features.shape[0] == labels.shape[0]
    return features, labels
|
||||
|
||||
def next(self, input_data: Callable) -> int:
    """Advance the iterator by 1 step and pass the data to XGBoost.  This function is
    called by XGBoost during the construction of ``DMatrix``

    Returns 0 once every file has been consumed, 1 otherwise.

    """
    exhausted = self._it == len(self._file_paths)
    if exhausted:
        # return 0 to let XGBoost know this is the end of iteration
        return 0

    # input_data is a function passed in by XGBoost who has the similar signature to
    # the ``DMatrix`` constructor.
    features, labels = self.load_file()
    input_data(data=features, label=labels)
    self._it += 1
    return 1
|
||||
|
||||
def reset(self) -> None:
    """Rewind the iterator so that the next batch loaded is the first one."""
    self._it = 0
|
||||
|
||||
|
||||
def main(tmpdir: str) -> xgboost.Booster:
    """Write random batches to ``tmpdir`` as text files, build an external memory
    ``DMatrix`` from an iterator over those files and train a booster on it.

    """
    # generate some random data for demo
    batches = make_batches(1024, 17, 31)
    files = []
    for i, (X, y) in enumerate(zip(*batches)):
        X_path = os.path.join(tmpdir, "X-" + str(i) + ".txt")
        np.savetxt(X_path, X)
        y_path = os.path.join(tmpdir, "y-" + str(i) + ".txt")
        np.savetxt(y_path, y)
        files.append((X_path, y_path))

    it = Iterator(files)
    # For non-data arguments, specify them here once instead of passing them by the
    # `next` method.
    # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
    missing = np.nan
    Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False)

    # Other tree methods including ``hist`` and ``gpu_hist`` also work, but have some
    # caveats.  This is still an experimental feature.
    booster = xgboost.train({"tree_method": "approx"}, Xy)
    return booster


if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmpdir:
        main(tmpdir)
|
||||
|
||||
@@ -85,7 +85,7 @@ def main():
|
||||
rounds = 100
|
||||
it = IterForDMatrixDemo()
|
||||
|
||||
# Use iterator, must be `DeviceQuantileDMatrix`
|
||||
# Use iterator, must be `DeviceQuantileDMatrix` for quantile DMatrix.
|
||||
m_with_it = xgboost.DeviceQuantileDMatrix(it)
|
||||
|
||||
# Use regular DMatrix.
|
||||
Reference in New Issue
Block a user