[pyspark] Cleanup data processing. (#8088)
- Use numpy stack for handling list of arrays. - Reuse concat function from dask. - Prepare for `QuantileDMatrix`. - Remove unused code. - Use iterator for prediction to avoid initializing the XGBoost model.
This commit is contained in:
@@ -1,11 +1,9 @@
|
||||
import sys
|
||||
import tempfile
|
||||
import shutil
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import pytest
|
||||
import testing as tm
|
||||
|
||||
if tm.no_spark()["condition"]:
|
||||
@@ -13,156 +11,90 @@ if tm.no_spark()["condition"]:
|
||||
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
|
||||
pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)
|
||||
|
||||
from xgboost.spark.data import (
|
||||
_row_tuple_list_to_feature_matrix_y_w,
|
||||
_convert_partition_data_to_dmatrix,
|
||||
)
|
||||
|
||||
from xgboost import DMatrix, XGBClassifier
|
||||
from xgboost.training import train as worker_train
|
||||
from .utils import SparkTestCase
|
||||
import logging
|
||||
|
||||
logging.getLogger("py4j").setLevel(logging.INFO)
|
||||
from xgboost.spark.data import alias, create_dmatrix_from_partitions, stack_series
|
||||
|
||||
|
||||
class DataTest(SparkTestCase):
|
||||
def test_sparse_dense_vector(self):
|
||||
def row_tup_iter(data):
|
||||
pdf = pd.DataFrame(data)
|
||||
yield pdf
|
||||
def test_stack() -> None:
|
||||
a = pd.DataFrame({"a": [[1, 2], [3, 4]]})
|
||||
b = stack_series(a["a"])
|
||||
assert b.shape == (2, 2)
|
||||
|
||||
expected_ndarray = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
|
||||
data = {"values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]]}
|
||||
feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
|
||||
list(row_tup_iter(data)),
|
||||
train=False,
|
||||
has_weight=False,
|
||||
has_fit_base_margin=False,
|
||||
has_predict_base_margin=False,
|
||||
a = pd.DataFrame({"a": [[1], [3]]})
|
||||
b = stack_series(a["a"])
|
||||
assert b.shape == (2, 1)
|
||||
|
||||
a = pd.DataFrame({"a": [np.array([1, 2]), np.array([3, 4])]})
|
||||
b = stack_series(a["a"])
|
||||
assert b.shape == (2, 2)
|
||||
|
||||
a = pd.DataFrame({"a": [np.array([1]), np.array([3])]})
|
||||
b = stack_series(a["a"])
|
||||
assert b.shape == (2, 1)
|
||||
|
||||
|
||||
def run_dmatrix_ctor(is_dqm: bool) -> None:
|
||||
rng = np.random.default_rng(0)
|
||||
dfs: List[pd.DataFrame] = []
|
||||
n_features = 16
|
||||
n_samples_per_batch = 16
|
||||
n_batches = 10
|
||||
feature_types = ["float"] * n_features
|
||||
|
||||
for i in range(n_batches):
|
||||
X = rng.normal(loc=0, size=256).reshape(n_samples_per_batch, n_features)
|
||||
y = rng.normal(loc=0, size=n_samples_per_batch)
|
||||
m = rng.normal(loc=0, size=n_samples_per_batch)
|
||||
w = rng.normal(loc=0.5, scale=0.5, size=n_samples_per_batch)
|
||||
w -= w.min()
|
||||
|
||||
valid = rng.binomial(n=1, p=0.5, size=16).astype(np.bool_)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{alias.label: y, alias.margin: m, alias.weight: w, alias.valid: valid}
|
||||
)
|
||||
self.assertIsNone(y)
|
||||
self.assertIsNone(w)
|
||||
self.assertTrue(np.allclose(feature_matrix, expected_ndarray))
|
||||
if is_dqm:
|
||||
for j in range(X.shape[1]):
|
||||
df[f"feat-{j}"] = pd.Series(X[:, j])
|
||||
else:
|
||||
df[alias.data] = pd.Series(list(X))
|
||||
dfs.append(df)
|
||||
|
||||
data["label"] = [1, 0]
|
||||
feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
|
||||
row_tup_iter(data),
|
||||
train=True,
|
||||
has_weight=False,
|
||||
has_fit_base_margin=False,
|
||||
has_predict_base_margin=False,
|
||||
)
|
||||
self.assertIsNone(w)
|
||||
self.assertTrue(np.allclose(feature_matrix, expected_ndarray))
|
||||
self.assertTrue(np.array_equal(y, np.array(data["label"])))
|
||||
kwargs = {"feature_types": feature_types}
|
||||
if is_dqm:
|
||||
cols = [f"feat-{i}" for i in range(n_features)]
|
||||
train_Xy, valid_Xy = create_dmatrix_from_partitions(iter(dfs), cols, kwargs)
|
||||
else:
|
||||
train_Xy, valid_Xy = create_dmatrix_from_partitions(iter(dfs), None, kwargs)
|
||||
|
||||
data["weight"] = [0.2, 0.8]
|
||||
feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
|
||||
list(row_tup_iter(data)),
|
||||
train=True,
|
||||
has_weight=True,
|
||||
has_fit_base_margin=False,
|
||||
has_predict_base_margin=False,
|
||||
)
|
||||
self.assertTrue(np.allclose(feature_matrix, expected_ndarray))
|
||||
self.assertTrue(np.array_equal(y, np.array(data["label"])))
|
||||
self.assertTrue(np.array_equal(w, np.array(data["weight"])))
|
||||
assert valid_Xy is not None
|
||||
assert valid_Xy.num_row() + train_Xy.num_row() == n_samples_per_batch * n_batches
|
||||
assert train_Xy.num_col() == n_features
|
||||
assert valid_Xy.num_col() == n_features
|
||||
|
||||
def test_dmatrix_creator(self):
|
||||
df = pd.concat(dfs, axis=0)
|
||||
df_train = df.loc[~df[alias.valid], :]
|
||||
df_valid = df.loc[df[alias.valid], :]
|
||||
|
||||
# This function acts as a pseudo-itertools.chain()
|
||||
def row_tup_iter(data):
|
||||
pdf = pd.DataFrame(data)
|
||||
yield pdf
|
||||
assert df_train.shape[0] == train_Xy.num_row()
|
||||
assert df_valid.shape[0] == valid_Xy.num_row()
|
||||
|
||||
# Standard testing DMatrix creation
|
||||
expected_features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
|
||||
expected_labels = np.array([1, 0] * 100)
|
||||
expected_dmatrix = DMatrix(data=expected_features, label=expected_labels)
|
||||
# margin
|
||||
np.testing.assert_allclose(
|
||||
df_train[alias.margin].to_numpy(), train_Xy.get_base_margin()
|
||||
)
|
||||
np.testing.assert_allclose(
|
||||
df_valid[alias.margin].to_numpy(), valid_Xy.get_base_margin()
|
||||
)
|
||||
# weight
|
||||
np.testing.assert_allclose(df_train[alias.weight].to_numpy(), train_Xy.get_weight())
|
||||
np.testing.assert_allclose(df_valid[alias.weight].to_numpy(), valid_Xy.get_weight())
|
||||
# label
|
||||
np.testing.assert_allclose(df_train[alias.label].to_numpy(), train_Xy.get_label())
|
||||
np.testing.assert_allclose(df_valid[alias.label].to_numpy(), valid_Xy.get_label())
|
||||
|
||||
data = {
|
||||
"values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
|
||||
"label": [1, 0] * 100,
|
||||
}
|
||||
output_dmatrix = _convert_partition_data_to_dmatrix(
|
||||
[pd.DataFrame(data)],
|
||||
has_weight=False,
|
||||
has_validation=False,
|
||||
has_base_margin=False,
|
||||
)
|
||||
# You can't compare DMatrix outputs, so the only way is to predict on the two separate DMatrices using
|
||||
# the same classifier and make sure the outputs are equal.
|
||||
model = XGBClassifier()
|
||||
model.fit(expected_features, expected_labels)
|
||||
expected_preds = model.get_booster().predict(expected_dmatrix)
|
||||
output_preds = model.get_booster().predict(output_dmatrix)
|
||||
self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))
|
||||
np.testing.assert_equal(train_Xy.feature_types, feature_types)
|
||||
np.testing.assert_equal(valid_Xy.feature_types, feature_types)
|
||||
|
||||
# DMatrix creation with weights
|
||||
expected_weight = np.array([0.2, 0.8] * 100)
|
||||
expected_dmatrix = DMatrix(
|
||||
data=expected_features, label=expected_labels, weight=expected_weight
|
||||
)
|
||||
|
||||
data["weight"] = [0.2, 0.8] * 100
|
||||
output_dmatrix = _convert_partition_data_to_dmatrix(
|
||||
[pd.DataFrame(data)],
|
||||
has_weight=True,
|
||||
has_validation=False,
|
||||
has_base_margin=False,
|
||||
)
|
||||
|
||||
model.fit(expected_features, expected_labels, sample_weight=expected_weight)
|
||||
expected_preds = model.get_booster().predict(expected_dmatrix)
|
||||
output_preds = model.get_booster().predict(output_dmatrix)
|
||||
self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))
|
||||
|
||||
def test_external_storage(self):
|
||||
# Instantiating base data (features, labels)
|
||||
features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
|
||||
labels = np.array([1, 0] * 100)
|
||||
normal_dmatrix = DMatrix(features, labels)
|
||||
test_dmatrix = DMatrix(features)
|
||||
|
||||
data = {
|
||||
"values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
|
||||
"label": [1, 0] * 100,
|
||||
}
|
||||
|
||||
# Creating the dmatrix based on storage
|
||||
temporary_path = tempfile.mkdtemp()
|
||||
storage_dmatrix = _convert_partition_data_to_dmatrix(
|
||||
[pd.DataFrame(data)],
|
||||
has_weight=False,
|
||||
has_validation=False,
|
||||
has_base_margin=False,
|
||||
)
|
||||
|
||||
# Testing without weights
|
||||
normal_booster = worker_train({}, normal_dmatrix)
|
||||
storage_booster = worker_train({}, storage_dmatrix)
|
||||
normal_preds = normal_booster.predict(test_dmatrix)
|
||||
storage_preds = storage_booster.predict(test_dmatrix)
|
||||
self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
|
||||
shutil.rmtree(temporary_path)
|
||||
|
||||
# Testing weights
|
||||
weights = np.array([0.2, 0.8] * 100)
|
||||
normal_dmatrix = DMatrix(data=features, label=labels, weight=weights)
|
||||
data["weight"] = [0.2, 0.8] * 100
|
||||
|
||||
temporary_path = tempfile.mkdtemp()
|
||||
storage_dmatrix = _convert_partition_data_to_dmatrix(
|
||||
[pd.DataFrame(data)],
|
||||
has_weight=True,
|
||||
has_validation=False,
|
||||
has_base_margin=False,
|
||||
)
|
||||
|
||||
normal_booster = worker_train({}, normal_dmatrix)
|
||||
storage_booster = worker_train({}, storage_dmatrix)
|
||||
normal_preds = normal_booster.predict(test_dmatrix)
|
||||
storage_preds = storage_booster.predict(test_dmatrix)
|
||||
self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
|
||||
shutil.rmtree(temporary_path)
|
||||
def test_dmatrix_ctor() -> None:
|
||||
run_dmatrix_ctor(False)
|
||||
|
||||
@@ -765,23 +765,22 @@ class XgboostLocalTest(SparkTestCase):
|
||||
self.reg_df_test_with_eval_weight
|
||||
).collect()
|
||||
for row in pred_result_with_weight:
|
||||
self.assertTrue(
|
||||
np.isclose(
|
||||
row.prediction, row.expected_prediction_with_weight, atol=1e-3
|
||||
)
|
||||
assert np.isclose(
|
||||
row.prediction, row.expected_prediction_with_weight, atol=1e-3
|
||||
)
|
||||
|
||||
# with eval
|
||||
regressor_with_eval = SparkXGBRegressor(**self.reg_params_with_eval)
|
||||
model_with_eval = regressor_with_eval.fit(self.reg_df_train_with_eval_weight)
|
||||
self.assertTrue(
|
||||
np.isclose(
|
||||
model_with_eval._xgb_sklearn_model.best_score,
|
||||
self.reg_with_eval_best_score,
|
||||
atol=1e-3,
|
||||
),
|
||||
f"Expected best score: {self.reg_with_eval_best_score}, "
|
||||
f"but get {model_with_eval._xgb_sklearn_model.best_score}",
|
||||
assert np.isclose(
|
||||
model_with_eval._xgb_sklearn_model.best_score,
|
||||
self.reg_with_eval_best_score,
|
||||
atol=1e-3,
|
||||
), (
|
||||
f"Expected best score: {self.reg_with_eval_best_score}, but ",
|
||||
f"get {model_with_eval._xgb_sklearn_model.best_score}",
|
||||
)
|
||||
|
||||
pred_result_with_eval = model_with_eval.transform(
|
||||
self.reg_df_test_with_eval_weight
|
||||
).collect()
|
||||
@@ -905,7 +904,7 @@ class XgboostLocalTest(SparkTestCase):
|
||||
# Check that, regardless of the booster, _convert_to_sklearn_model converts to the correct class type
|
||||
sklearn_classifier = classifier._convert_to_sklearn_model(
|
||||
clf_model.get_booster().save_raw("json"),
|
||||
clf_model.get_booster().save_config()
|
||||
clf_model.get_booster().save_config(),
|
||||
)
|
||||
assert isinstance(sklearn_classifier, XGBClassifier)
|
||||
assert sklearn_classifier.n_estimators == 200
|
||||
@@ -915,7 +914,7 @@ class XgboostLocalTest(SparkTestCase):
|
||||
|
||||
sklearn_regressor = regressor._convert_to_sklearn_model(
|
||||
reg_model.get_booster().save_raw("json"),
|
||||
reg_model.get_booster().save_config()
|
||||
reg_model.get_booster().save_config(),
|
||||
)
|
||||
assert isinstance(sklearn_regressor, XGBRegressor)
|
||||
assert sklearn_regressor.n_estimators == 200
|
||||
|
||||
Reference in New Issue
Block a user