[pyspark] support pred_contribs (#8633)
This commit is contained in:
@@ -8,6 +8,7 @@ from typing import Generator, Sequence, Type
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from xgboost.spark.data import pred_contribs
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
@@ -158,6 +159,103 @@ def reg_with_weight(
|
||||
)
|
||||
|
||||
|
||||
RegData = namedtuple("RegData", ("reg_df_train", "reg_df_test"))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def reg_data(spark: SparkSession) -> Generator[RegData, None, None]:
|
||||
X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
|
||||
y = np.array([0, 1])
|
||||
reg1 = xgb.XGBRegressor()
|
||||
reg1.fit(X, y)
|
||||
predt0 = reg1.predict(X)
|
||||
pred_contrib0: np.ndarray = pred_contribs(reg1, X, None, False)
|
||||
|
||||
# convert np array to pyspark dataframe
|
||||
reg_df_train_data = [
|
||||
(Vectors.dense(X[0, :]), int(y[0])),
|
||||
(Vectors.sparse(3, {1: float(X[1, 1]), 2: float(X[1, 2])}), int(y[1])),
|
||||
]
|
||||
reg_df_train = spark.createDataFrame(reg_df_train_data, ["features", "label"])
|
||||
|
||||
reg_df_test = spark.createDataFrame(
|
||||
[
|
||||
(
|
||||
Vectors.dense(X[0, :]),
|
||||
float(predt0[0]),
|
||||
pred_contrib0[0, :].tolist(),
|
||||
),
|
||||
(
|
||||
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
|
||||
float(predt0[1]),
|
||||
pred_contrib0[1, :].tolist(),
|
||||
),
|
||||
],
|
||||
[
|
||||
"features",
|
||||
"expected_prediction",
|
||||
"expected_pred_contribs",
|
||||
],
|
||||
)
|
||||
yield RegData(reg_df_train, reg_df_test)
|
||||
|
||||
|
||||
MultiClfData = namedtuple("MultiClfData", ("multi_clf_df_train", "multi_clf_df_test"))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def multi_clf_data(spark: SparkSession) -> Generator[MultiClfData, None, None]:
|
||||
|
||||
X = np.array([[1.0, 2.0, 3.0], [1.0, 2.0, 4.0], [0.0, 1.0, 5.5], [-1.0, -2.0, 1.0]])
|
||||
y = np.array([0, 0, 1, 2])
|
||||
cls1 = xgb.XGBClassifier()
|
||||
cls1.fit(X, y)
|
||||
predt0 = cls1.predict(X)
|
||||
proba0: np.ndarray = cls1.predict_proba(X)
|
||||
pred_contrib0: np.ndarray = pred_contribs(cls1, X, None, False)
|
||||
|
||||
# convert np array to pyspark dataframe
|
||||
multi_cls_df_train_data = [
|
||||
(Vectors.dense(X[0, :]), int(y[0])),
|
||||
(Vectors.dense(X[1, :]), int(y[1])),
|
||||
(Vectors.sparse(3, {1: float(X[2, 1]), 2: float(X[2, 2])}), int(y[2])),
|
||||
(Vectors.dense(X[3, :]), int(y[3])),
|
||||
]
|
||||
multi_clf_df_train = spark.createDataFrame(
|
||||
multi_cls_df_train_data, ["features", "label"]
|
||||
)
|
||||
|
||||
multi_clf_df_test = spark.createDataFrame(
|
||||
[
|
||||
(
|
||||
Vectors.dense(X[0, :]),
|
||||
float(predt0[0]),
|
||||
proba0[0, :].tolist(),
|
||||
pred_contrib0[0, :].tolist(),
|
||||
),
|
||||
(
|
||||
Vectors.dense(X[1, :]),
|
||||
float(predt0[1]),
|
||||
proba0[1, :].tolist(),
|
||||
pred_contrib0[1, :].tolist(),
|
||||
),
|
||||
(
|
||||
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
|
||||
float(predt0[2]),
|
||||
proba0[2, :].tolist(),
|
||||
pred_contrib0[2, :].tolist(),
|
||||
),
|
||||
],
|
||||
[
|
||||
"features",
|
||||
"expected_prediction",
|
||||
"expected_probability",
|
||||
"expected_pred_contribs",
|
||||
],
|
||||
)
|
||||
yield MultiClfData(multi_clf_df_train, multi_clf_df_test)
|
||||
|
||||
|
||||
ClfWithWeight = namedtuple(
|
||||
"ClfWithWeight",
|
||||
(
|
||||
@@ -264,10 +362,12 @@ def clf_data(spark: SparkSession) -> Generator[ClfData, None, None]:
|
||||
cl1.fit(X, y)
|
||||
predt0 = cl1.predict(X)
|
||||
proba0: np.ndarray = cl1.predict_proba(X)
|
||||
cl2 = xgb.XGBClassifier(max_depth=5, n_estimators=10, scale_pos_weight=4)
|
||||
pred_contrib0: np.ndarray = pred_contribs(cl1, X, None, True)
|
||||
cl2 = xgb.XGBClassifier(**cls_params)
|
||||
cl2.fit(X, y)
|
||||
predt1 = cl2.predict(X)
|
||||
proba1: np.ndarray = cl2.predict_proba(X)
|
||||
pred_contrib1: np.ndarray = pred_contribs(cl2, X, None, True)
|
||||
|
||||
# convert np array to pyspark dataframe
|
||||
cls_df_train_data = [
|
||||
@@ -286,23 +386,29 @@ def clf_data(spark: SparkSession) -> Generator[ClfData, None, None]:
|
||||
Vectors.dense(X[0, :]),
|
||||
int(predt0[0]),
|
||||
proba0[0, :].tolist(),
|
||||
pred_contrib0[0, :].tolist(),
|
||||
int(predt1[0]),
|
||||
proba1[0, :].tolist(),
|
||||
pred_contrib1[0, :].tolist(),
|
||||
),
|
||||
(
|
||||
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
|
||||
int(predt0[1]),
|
||||
proba0[1, :].tolist(),
|
||||
pred_contrib0[1, :].tolist(),
|
||||
int(predt1[1]),
|
||||
proba1[1, :].tolist(),
|
||||
pred_contrib1[1, :].tolist(),
|
||||
),
|
||||
],
|
||||
[
|
||||
"features",
|
||||
"expected_prediction",
|
||||
"expected_probability",
|
||||
"expected_pred_contribs",
|
||||
"expected_prediction_with_params",
|
||||
"expected_probability_with_params",
|
||||
"expected_pred_contribs_with_params",
|
||||
],
|
||||
)
|
||||
yield ClfData(cls_params, cls_df_train, cls_df_train_large, cls_df_test)
|
||||
@@ -331,6 +437,16 @@ def get_params_map(params_kv: dict, estimator: Type) -> dict:
|
||||
|
||||
|
||||
class TestPySparkLocal:
|
||||
def test_regressor_basic(self, reg_data: RegData) -> None:
|
||||
regressor = SparkXGBRegressor(pred_contrib_col="pred_contribs")
|
||||
model = regressor.fit(reg_data.reg_df_train)
|
||||
pred_result = model.transform(reg_data.reg_df_test).collect()
|
||||
for row in pred_result:
|
||||
np.testing.assert_equal(row.prediction, row.expected_prediction)
|
||||
np.testing.assert_allclose(
|
||||
row.pred_contribs, row.expected_pred_contribs, atol=1e-3
|
||||
)
|
||||
|
||||
def test_regressor_with_weight_eval(self, reg_with_weight: RegWithWeight) -> None:
|
||||
# with weight
|
||||
regressor_with_weight = SparkXGBRegressor(weight_col="weight")
|
||||
@@ -385,6 +501,19 @@ class TestPySparkLocal:
|
||||
atol=1e-3,
|
||||
)
|
||||
|
||||
def test_multi_classifier_basic(self, multi_clf_data: MultiClfData) -> None:
|
||||
cls = SparkXGBClassifier(pred_contrib_col="pred_contribs")
|
||||
model = cls.fit(multi_clf_data.multi_clf_df_train)
|
||||
pred_result = model.transform(multi_clf_data.multi_clf_df_test).collect()
|
||||
for row in pred_result:
|
||||
np.testing.assert_equal(row.prediction, row.expected_prediction)
|
||||
np.testing.assert_allclose(
|
||||
row.probability, row.expected_probability, rtol=1e-3
|
||||
)
|
||||
np.testing.assert_allclose(
|
||||
row.pred_contribs, row.expected_pred_contribs, atol=1e-3
|
||||
)
|
||||
|
||||
def test_classifier_with_weight_eval(self, clf_with_weight: ClfWithWeight) -> None:
|
||||
# with weight
|
||||
classifier_with_weight = SparkXGBClassifier(weight_col="weight")
|
||||
@@ -459,13 +588,18 @@ class TestPySparkLocal:
|
||||
assert_model_compatible(model, tmpdir)
|
||||
|
||||
def test_classifier_basic(self, clf_data: ClfData) -> None:
|
||||
classifier = SparkXGBClassifier()
|
||||
classifier = SparkXGBClassifier(
|
||||
**clf_data.cls_params, pred_contrib_col="pred_contrib"
|
||||
)
|
||||
model = classifier.fit(clf_data.cls_df_train)
|
||||
pred_result = model.transform(clf_data.cls_df_test).collect()
|
||||
for row in pred_result:
|
||||
np.testing.assert_equal(row.prediction, row.expected_prediction)
|
||||
np.testing.assert_equal(row.prediction, row.expected_prediction_with_params)
|
||||
np.testing.assert_allclose(
|
||||
row.probability, row.expected_probability, rtol=1e-3
|
||||
row.probability, row.expected_probability_with_params, rtol=1e-3
|
||||
)
|
||||
np.testing.assert_equal(
|
||||
row.pred_contrib, row.expected_pred_contribs_with_params
|
||||
)
|
||||
|
||||
def test_classifier_with_params(self, clf_data: ClfData) -> None:
|
||||
@@ -648,24 +782,6 @@ class XgboostLocalTest(SparkTestCase):
|
||||
# >>> cl.fit(X, y)
|
||||
# >>> cl.predict_proba(np.array([[1.0, 2.0, 3.0]]))
|
||||
# array([[0.5374299 , 0.23128504, 0.23128504]], dtype=float32)
|
||||
multi_cls_df_train_data = [
|
||||
(Vectors.dense(1.0, 2.0, 3.0), 0),
|
||||
(Vectors.dense(1.0, 2.0, 4.0), 0),
|
||||
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
|
||||
(Vectors.dense(-1.0, -2.0, 1.0), 2),
|
||||
]
|
||||
self.multi_cls_df_train = self.session.createDataFrame(
|
||||
multi_cls_df_train_data, ["features", "label"]
|
||||
)
|
||||
self.multi_cls_df_train_large = self.session.createDataFrame(
|
||||
multi_cls_df_train_data * 100, ["features", "label"]
|
||||
)
|
||||
self.multi_cls_df_test = self.session.createDataFrame(
|
||||
[
|
||||
(Vectors.dense(1.0, 2.0, 3.0), [0.5374, 0.2312, 0.2312]),
|
||||
],
|
||||
["features", "expected_probability"],
|
||||
)
|
||||
|
||||
# Test classifier with both base margin and without
|
||||
# >>> import numpy as np
|
||||
@@ -872,24 +988,6 @@ class XgboostLocalTest(SparkTestCase):
|
||||
== "float64"
|
||||
)
|
||||
|
||||
def test_regressor_basic(self):
|
||||
regressor = SparkXGBRegressor()
|
||||
model = regressor.fit(self.reg_df_train)
|
||||
pred_result = model.transform(self.reg_df_test).collect()
|
||||
for row in pred_result:
|
||||
self.assertTrue(
|
||||
np.isclose(row.prediction, row.expected_prediction, atol=1e-3)
|
||||
)
|
||||
|
||||
def test_multi_classifier(self):
|
||||
classifier = SparkXGBClassifier()
|
||||
model = classifier.fit(self.multi_cls_df_train)
|
||||
pred_result = model.transform(self.multi_cls_df_test).collect()
|
||||
for row in pred_result:
|
||||
self.assertTrue(
|
||||
np.allclose(row.probability, row.expected_probability, rtol=1e-3)
|
||||
)
|
||||
|
||||
def test_regressor_with_params(self):
|
||||
regressor = SparkXGBRegressor(**self.reg_params)
|
||||
all_params = dict(
|
||||
|
||||
Reference in New Issue
Block a user