Obtain CSR matrix from DMatrix. (#8269)

This commit is contained in:
Jiaming Yuan
2022-09-29 20:41:43 +08:00
committed by GitHub
parent b14c44ee5e
commit 55cf24cc32
22 changed files with 400 additions and 74 deletions

View File

@@ -1,13 +1,14 @@
# -*- coding: utf-8 -*-
import os
import tempfile
import numpy as np
import xgboost as xgb
import scipy.sparse
import pytest
from scipy.sparse import rand, csr_matrix
import numpy as np
import pytest
import scipy.sparse
import testing as tm
from hypothesis import given, settings, strategies
from scipy.sparse import csr_matrix, rand
import xgboost as xgb
rng = np.random.RandomState(1)
@@ -433,3 +434,22 @@ class TestDMatrix:
def test_base_margin(self):
    """Run the shared base-margin check with ``np.asarray``, ``xgb.DMatrix`` and ``"hist"``."""
    set_base_margin_info(np.asarray, xgb.DMatrix, "hist")
@given(
    strategies.integers(0, 1000),
    strategies.integers(0, 100),
    strategies.fractions(0, 1),
)
@settings(deadline=None, print_blob=True)
def test_to_csr(self, n_samples, n_features, sparsity) -> None:
    """Check that ``DMatrix.get_data`` hands back the CSR arrays it was built from.

    When there are no samples, no features, or the sparsity is 1.0, an empty
    0x0 CSR matrix is used as input instead of generated regression data.
    """
    is_degenerate = n_samples == 0 or n_features == 0 or sparsity == 1.0
    if not is_degenerate:
        generated = tm.make_sparse_regression(
            n_samples, n_features, sparsity, False
        )
        # Only the feature matrix (first element) is needed here.
        expected = generated[0].astype(np.float32)
    else:
        expected = scipy.sparse.csr_matrix(np.empty((0, 0)))

    dmatrix = xgb.DMatrix(data=expected)
    actual = dmatrix.get_data()

    # Compare the three arrays that make up the CSR representation.
    for attr in ("indptr", "data", "indices"):
        np.testing.assert_equal(getattr(expected, attr), getattr(actual, attr))

View File

@@ -1,9 +1,16 @@
from typing import Dict, List, Any
from typing import Any, Dict, List
import numpy as np
import pytest
from hypothesis import given, settings, strategies
from scipy import sparse
from testing import IteratorForTest, make_batches, make_batches_sparse, make_categorical
from testing import (
IteratorForTest,
make_batches,
make_batches_sparse,
make_categorical,
make_sparse_regression,
)
import xgboost as xgb
@@ -102,6 +109,7 @@ class TestQuantileDMatrix:
)
if tree_method == "gpu_hist":
import cudf
X = cudf.from_pandas(X)
y = cudf.from_pandas(y)
else:
@@ -154,6 +162,7 @@ class TestQuantileDMatrix:
X, y = make_categorical(n_samples, n_features, 13, onehot=False)
if tree_method == "gpu_hist":
import cudf
X = cudf.from_pandas(X)
y = cudf.from_pandas(y)
else:
@@ -198,9 +207,7 @@ class TestQuantileDMatrix:
def test_predict(self) -> None:
n_samples, n_features = 16, 2
X, y = make_categorical(
n_samples, n_features, n_categories=13, onehot=False
)
X, y = make_categorical(n_samples, n_features, n_categories=13, onehot=False)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
booster = xgb.train({"tree_method": "hist"}, Xy)
@@ -210,3 +217,24 @@ class TestQuantileDMatrix:
qXy = xgb.QuantileDMatrix(X, y, enable_categorical=True)
b = booster.predict(qXy)
np.testing.assert_allclose(a, b)
# An empty QuantileDMatrix is not tested in single-node construction, so the
# strategies below start at one sample / one feature and cap sparsity at 0.99.
@given(
    strategies.integers(1, 1000),
    strategies.integers(1, 100),
    strategies.fractions(0, 0.99),
)
@settings(deadline=None, print_blob=True)
def test_to_csr(self, n_samples: int, n_features: int, sparsity: float) -> None:
    """Check the CSR matrix returned by ``QuantileDMatrix.get_data``.

    The returned matrix must have the same ``indptr`` and ``indices`` as the
    input, and a booster trained on the quantile matrix must predict the same
    values on it as on a plain ``DMatrix`` rebuilt from ``get_data()``.
    """
    features, y = make_sparse_regression(n_samples, n_features, sparsity, False)
    expected = features.astype(np.float32)

    qdm = xgb.QuantileDMatrix(data=expected, label=y)
    actual = qdm.get_data()

    # Only the sparsity structure is compared directly; ``data`` is not.
    # NOTE(review): presumably because the stored values are quantized - confirm.
    for attr in ("indptr", "indices"):
        np.testing.assert_equal(getattr(expected, attr), getattr(actual, attr))

    booster = xgb.train({"tree_method": "hist"}, dtrain=qdm)
    predt_quantile = booster.predict(qdm)
    predt_round_trip = booster.predict(xgb.DMatrix(qdm.get_data()))
    np.testing.assert_allclose(predt_quantile, predt_round_trip)

View File

@@ -577,6 +577,8 @@ def make_sparse_regression(
if as_dense:
arr = csr.toarray()
assert arr.shape[0] == n_samples
assert arr.shape[1] == n_features
arr[arr == 0] = np.nan
return arr, y