Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu> Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
169 lines
6.1 KiB
Python
169 lines
6.1 KiB
Python
import sys
|
|
import tempfile
|
|
import shutil
|
|
|
|
import pytest
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
import testing as tm
|
|
|
|
# Skip the whole module when the Spark test prerequisites reported by
# ``tm.no_spark()`` are not met.  The reason is passed positionally because
# the keyword name differs between pytest releases (``msg`` was deprecated in
# pytest 7 and removed in pytest 8 in favour of ``reason``).
if tm.no_spark()["condition"]:
    pytest.skip(tm.no_spark()["reason"], allow_module_level=True)

# The PySpark tests are not run on Windows or macOS ("darwin").
if sys.platform.startswith(("win", "darwin")):
    pytest.skip("Skipping PySpark tests on Windows and macOS", allow_module_level=True)
|
|
|
|
from xgboost.spark.data import (
|
|
_row_tuple_list_to_feature_matrix_y_w,
|
|
_convert_partition_data_to_dmatrix,
|
|
)
|
|
|
|
from xgboost import DMatrix, XGBClassifier
|
|
from xgboost.training import train as worker_train
|
|
from .utils import SparkTestCase
|
|
import logging
|
|
|
|
# Set the "py4j" logger's threshold to INFO so that records below INFO
# (e.g. DEBUG) emitted through that logger are ignored during the tests.
logging.getLogger("py4j").setLevel(logging.INFO)
|
|
|
|
|
|
class DataTest(SparkTestCase):
    """Tests for the partition-data helpers imported from ``xgboost.spark.data``.

    Each test builds small in-memory ``pandas.DataFrame`` inputs, runs them
    through ``_row_tuple_list_to_feature_matrix_y_w`` or
    ``_convert_partition_data_to_dmatrix`` and compares the result against
    reference data built directly with NumPy / ``DMatrix``.
    """

    @staticmethod
    def _assert_close(actual, desired, atol=1e-8):
        """Assert that two arrays agree within ``np.allclose``-style tolerances.

        ``np.testing.assert_allclose`` is used instead of
        ``assertTrue(np.allclose(...))`` so that a failure reports the
        mismatching values rather than just "False is not true".  ``rtol``,
        ``atol`` and ``equal_nan`` are spelled out because their defaults
        differ between ``np.allclose`` and ``assert_allclose``.

        Parameters
        ----------
        actual, desired : array-like
            The value under test and the reference value.
        atol : float
            Absolute tolerance; defaults to the ``np.allclose`` default.
        """
        np.testing.assert_allclose(
            actual, desired, rtol=1e-5, atol=atol, equal_nan=False
        )

    def test_sparse_dense_vector(self):
        """Features, labels and weights are extracted from a DataFrame."""

        def row_tup_iter(data):
            # Wrap the column dict in one DataFrame and yield it as the only item.
            yield pd.DataFrame(data)

        expected_ndarray = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
        data = {"values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]]}

        # Features only: with train=False no label or weight is expected back.
        feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
            list(row_tup_iter(data)),
            train=False,
            has_weight=False,
            has_fit_base_margin=False,
            has_predict_base_margin=False,
        )
        self.assertIsNone(y)
        self.assertIsNone(w)
        self._assert_close(feature_matrix, expected_ndarray)

        # Features + labels.  The input is deliberately left as a generator
        # here (the other calls pass a list), as in the original test.
        data["label"] = [1, 0]
        feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
            row_tup_iter(data),
            train=True,
            has_weight=False,
            has_fit_base_margin=False,
            has_predict_base_margin=False,
        )
        self.assertIsNone(w)
        self._assert_close(feature_matrix, expected_ndarray)
        np.testing.assert_array_equal(y, np.array(data["label"]))

        # Features + labels + weights.
        data["weight"] = [0.2, 0.8]
        feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
            list(row_tup_iter(data)),
            train=True,
            has_weight=True,
            has_fit_base_margin=False,
            has_predict_base_margin=False,
        )
        self._assert_close(feature_matrix, expected_ndarray)
        np.testing.assert_array_equal(y, np.array(data["label"]))
        np.testing.assert_array_equal(w, np.array(data["weight"]))

    def test_dmatrix_creator(self):
        """A DMatrix built from partition data predicts like a reference DMatrix."""
        # Standard testing DMatrix creation
        expected_features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
        expected_labels = np.array([1, 0] * 100)
        expected_dmatrix = DMatrix(data=expected_features, label=expected_labels)

        data = {
            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
            "label": [1, 0] * 100,
        }
        output_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=False,
            has_validation=False,
            has_base_margin=False,
        )
        # DMatrix objects cannot be compared directly, so predict on the two
        # separate DMatrices with the same classifier and check that the
        # outputs are equal.
        model = XGBClassifier()
        model.fit(expected_features, expected_labels)
        expected_preds = model.get_booster().predict(expected_dmatrix)
        output_preds = model.get_booster().predict(output_dmatrix)
        self._assert_close(output_preds, expected_preds, atol=1e-3)

        # DMatrix creation with weights
        expected_weight = np.array([0.2, 0.8] * 100)
        expected_dmatrix = DMatrix(
            data=expected_features, label=expected_labels, weight=expected_weight
        )

        data["weight"] = [0.2, 0.8] * 100
        output_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=True,
            has_validation=False,
            has_base_margin=False,
        )

        model.fit(expected_features, expected_labels, sample_weight=expected_weight)
        expected_preds = model.get_booster().predict(expected_dmatrix)
        output_preds = model.get_booster().predict(output_dmatrix)
        self._assert_close(output_preds, expected_preds, atol=1e-3)

    def test_external_storage(self):
        """Boosters trained on converted and reference DMatrices predict alike."""
        # Instantiating base data (features, labels)
        features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
        labels = np.array([1, 0] * 100)
        normal_dmatrix = DMatrix(features, labels)
        test_dmatrix = DMatrix(features)

        data = {
            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
            "label": [1, 0] * 100,
        }

        # Testing without weights.
        # NOTE(review): the scratch directory is created and removed around
        # the conversion exactly as before, but its path is never handed to
        # the converter in this test -- confirm whether it is still needed.
        # The context manager guarantees removal even if an assertion fails,
        # which the previous mkdtemp()/rmtree() pairing did not.
        with tempfile.TemporaryDirectory():
            storage_dmatrix = _convert_partition_data_to_dmatrix(
                [pd.DataFrame(data)],
                has_weight=False,
                has_validation=False,
                has_base_margin=False,
            )

            normal_booster = worker_train({}, normal_dmatrix)
            storage_booster = worker_train({}, storage_dmatrix)
            normal_preds = normal_booster.predict(test_dmatrix)
            storage_preds = storage_booster.predict(test_dmatrix)
            self._assert_close(storage_preds, normal_preds, atol=1e-3)

        # Testing weights
        weights = np.array([0.2, 0.8] * 100)
        normal_dmatrix = DMatrix(data=features, label=labels, weight=weights)
        data["weight"] = [0.2, 0.8] * 100

        with tempfile.TemporaryDirectory():
            storage_dmatrix = _convert_partition_data_to_dmatrix(
                [pd.DataFrame(data)],
                has_weight=True,
                has_validation=False,
                has_base_margin=False,
            )

            normal_booster = worker_train({}, normal_dmatrix)
            storage_booster = worker_train({}, storage_dmatrix)
            normal_preds = normal_booster.predict(test_dmatrix)
            storage_preds = storage_booster.predict(test_dmatrix)
            self._assert_close(storage_preds, normal_preds, atol=1e-3)