Improve external memory demo. (#7320)

* Use npy format.
* Add evaluation.
* Use make_regression.
This commit is contained in:
Jiaming Yuan 2021-10-17 11:25:24 +08:00 committed by GitHub
parent e6a142fe70
commit 6cdcfe8128
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -8,23 +8,24 @@ feature is not ready for production use yet.
import os import os
import xgboost import xgboost
from typing import Callable, List, Tuple from typing import Callable, List, Tuple
from sklearn.datasets import make_regression
import tempfile import tempfile
import numpy as np import numpy as np
def make_batches( def make_batches(
n_samples_per_batch: int, n_features: int, n_batches: int n_samples_per_batch: int, n_features: int, n_batches: int, tmpdir: str,
) -> Tuple[List[np.ndarray], List[np.ndarray]]: ) -> List[Tuple[str, str]]:
"""Generate random batches.""" files: List[Tuple[str, str]] = []
X = []
y = []
rng = np.random.RandomState(1994) rng = np.random.RandomState(1994)
for i in range(n_batches): for i in range(n_batches):
_X = rng.randn(n_samples_per_batch, n_features) X, y = make_regression(n_samples_per_batch, n_features, random_state=rng)
_y = rng.randn(n_samples_per_batch) X_path = os.path.join(tmpdir, "X-" + str(i) + ".npy")
X.append(_X) y_path = os.path.join(tmpdir, "y-" + str(i) + ".npy")
y.append(_y) np.save(X_path, X)
return X, y np.save(y_path, y)
files.append((X_path, y_path))
return files
class Iterator(xgboost.DataIter): class Iterator(xgboost.DataIter):
@ -38,8 +39,8 @@ class Iterator(xgboost.DataIter):
def load_file(self) -> Tuple[np.ndarray, np.ndarray]: def load_file(self) -> Tuple[np.ndarray, np.ndarray]:
X_path, y_path = self._file_paths[self._it] X_path, y_path = self._file_paths[self._it]
X = np.loadtxt(X_path) X = np.load(X_path)
y = np.loadtxt(y_path) y = np.load(y_path)
assert X.shape[0] == y.shape[0] assert X.shape[0] == y.shape[0]
return X, y return X, y
@ -66,15 +67,7 @@ class Iterator(xgboost.DataIter):
def main(tmpdir: str) -> xgboost.Booster: def main(tmpdir: str) -> xgboost.Booster:
# generate some random data for demo # generate some random data for demo
batches = make_batches(1024, 17, 31) files = make_batches(1024, 17, 31, tmpdir)
files = []
for i, (X, y) in enumerate(zip(*batches)):
X_path = os.path.join(tmpdir, "X-" + str(i) + ".txt")
np.savetxt(X_path, X)
y_path = os.path.join(tmpdir, "y-" + str(i) + ".txt")
np.savetxt(y_path, y)
files.append((X_path, y_path))
it = Iterator(files) it = Iterator(files)
# For non-data arguments, specify them here once instead of passing them by the `next` # For non-data arguments, specify them here once instead of passing them by the `next`
# method. # method.
@ -83,7 +76,7 @@ def main(tmpdir: str) -> xgboost.Booster:
# Other tree methods including ``hist`` and ``gpu_hist`` also work, but have some # Other tree methods including ``hist`` and ``gpu_hist`` also work, but have some
# caveats. This is still an experimental feature. # caveats. This is still an experimental feature.
booster = xgboost.train({"tree_method": "approx"}, Xy) booster = xgboost.train({"tree_method": "approx"}, Xy, evals=[(Xy, "Train")])
return booster return booster