Improve external memory demo. (#7320)
* Use npy format.
* Add evaluation.
* Use make_regression.
This commit is contained in:
parent
e6a142fe70
commit
6cdcfe8128
@ -8,23 +8,24 @@ feature is not ready for production use yet.
|
|||||||
import os
|
import os
|
||||||
import xgboost
|
import xgboost
|
||||||
from typing import Callable, List, Tuple
|
from typing import Callable, List, Tuple
|
||||||
|
from sklearn.datasets import make_regression
|
||||||
import tempfile
|
import tempfile
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def make_batches(
|
def make_batches(
|
||||||
n_samples_per_batch: int, n_features: int, n_batches: int
|
n_samples_per_batch: int, n_features: int, n_batches: int, tmpdir: str,
|
||||||
) -> Tuple[List[np.ndarray], List[np.ndarray]]:
|
) -> List[Tuple[str, str]]:
|
||||||
"""Generate random batches."""
|
files: List[Tuple[str, str]] = []
|
||||||
X = []
|
|
||||||
y = []
|
|
||||||
rng = np.random.RandomState(1994)
|
rng = np.random.RandomState(1994)
|
||||||
for i in range(n_batches):
|
for i in range(n_batches):
|
||||||
_X = rng.randn(n_samples_per_batch, n_features)
|
X, y = make_regression(n_samples_per_batch, n_features, random_state=rng)
|
||||||
_y = rng.randn(n_samples_per_batch)
|
X_path = os.path.join(tmpdir, "X-" + str(i) + ".npy")
|
||||||
X.append(_X)
|
y_path = os.path.join(tmpdir, "y-" + str(i) + ".npy")
|
||||||
y.append(_y)
|
np.save(X_path, X)
|
||||||
return X, y
|
np.save(y_path, y)
|
||||||
|
files.append((X_path, y_path))
|
||||||
|
return files
|
||||||
|
|
||||||
|
|
||||||
class Iterator(xgboost.DataIter):
|
class Iterator(xgboost.DataIter):
|
||||||
@ -38,8 +39,8 @@ class Iterator(xgboost.DataIter):
|
|||||||
|
|
||||||
def load_file(self) -> Tuple[np.ndarray, np.ndarray]:
|
def load_file(self) -> Tuple[np.ndarray, np.ndarray]:
|
||||||
X_path, y_path = self._file_paths[self._it]
|
X_path, y_path = self._file_paths[self._it]
|
||||||
X = np.loadtxt(X_path)
|
X = np.load(X_path)
|
||||||
y = np.loadtxt(y_path)
|
y = np.load(y_path)
|
||||||
assert X.shape[0] == y.shape[0]
|
assert X.shape[0] == y.shape[0]
|
||||||
return X, y
|
return X, y
|
||||||
|
|
||||||
@ -66,15 +67,7 @@ class Iterator(xgboost.DataIter):
|
|||||||
|
|
||||||
def main(tmpdir: str) -> xgboost.Booster:
|
def main(tmpdir: str) -> xgboost.Booster:
|
||||||
# generate some random data for demo
|
# generate some random data for demo
|
||||||
batches = make_batches(1024, 17, 31)
|
files = make_batches(1024, 17, 31, tmpdir)
|
||||||
files = []
|
|
||||||
for i, (X, y) in enumerate(zip(*batches)):
|
|
||||||
X_path = os.path.join(tmpdir, "X-" + str(i) + ".txt")
|
|
||||||
np.savetxt(X_path, X)
|
|
||||||
y_path = os.path.join(tmpdir, "y-" + str(i) + ".txt")
|
|
||||||
np.savetxt(y_path, y)
|
|
||||||
files.append((X_path, y_path))
|
|
||||||
|
|
||||||
it = Iterator(files)
|
it = Iterator(files)
|
||||||
# For non-data arguments, specify it here once instead of passing them by the `next`
|
# For non-data arguments, specify it here once instead of passing them by the `next`
|
||||||
# method.
|
# method.
|
||||||
@ -83,7 +76,7 @@ def main(tmpdir: str) -> xgboost.Booster:
|
|||||||
|
|
||||||
# Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some
|
# Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some
|
||||||
# caveats. This is still an experimental feature.
|
# caveats. This is still an experimental feature.
|
||||||
booster = xgboost.train({"tree_method": "approx"}, Xy)
|
booster = xgboost.train({"tree_method": "approx"}, Xy, evals=[(Xy, "Train")])
|
||||||
return booster
|
return booster
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user