[pyspark] support pred_contribs (#8633)

This commit is contained in:
Bobby Wang
2023-01-11 16:51:12 +08:00
committed by GitHub
parent cfa994d57f
commit 72ec0c5484
4 changed files with 287 additions and 67 deletions

View File

@@ -8,6 +8,7 @@ from typing import Generator, Sequence, Type
import numpy as np
import pytest
from xgboost.spark.data import pred_contribs
import xgboost as xgb
from xgboost import testing as tm
@@ -158,6 +159,103 @@ def reg_with_weight(
)
RegData = namedtuple("RegData", ("reg_df_train", "reg_df_test"))
@pytest.fixture
def reg_data(spark: SparkSession) -> Generator[RegData, None, None]:
X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
y = np.array([0, 1])
reg1 = xgb.XGBRegressor()
reg1.fit(X, y)
predt0 = reg1.predict(X)
pred_contrib0: np.ndarray = pred_contribs(reg1, X, None, False)
# convert np array to pyspark dataframe
reg_df_train_data = [
(Vectors.dense(X[0, :]), int(y[0])),
(Vectors.sparse(3, {1: float(X[1, 1]), 2: float(X[1, 2])}), int(y[1])),
]
reg_df_train = spark.createDataFrame(reg_df_train_data, ["features", "label"])
reg_df_test = spark.createDataFrame(
[
(
Vectors.dense(X[0, :]),
float(predt0[0]),
pred_contrib0[0, :].tolist(),
),
(
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
float(predt0[1]),
pred_contrib0[1, :].tolist(),
),
],
[
"features",
"expected_prediction",
"expected_pred_contribs",
],
)
yield RegData(reg_df_train, reg_df_test)
MultiClfData = namedtuple("MultiClfData", ("multi_clf_df_train", "multi_clf_df_test"))
@pytest.fixture
def multi_clf_data(spark: SparkSession) -> Generator[MultiClfData, None, None]:
X = np.array([[1.0, 2.0, 3.0], [1.0, 2.0, 4.0], [0.0, 1.0, 5.5], [-1.0, -2.0, 1.0]])
y = np.array([0, 0, 1, 2])
cls1 = xgb.XGBClassifier()
cls1.fit(X, y)
predt0 = cls1.predict(X)
proba0: np.ndarray = cls1.predict_proba(X)
pred_contrib0: np.ndarray = pred_contribs(cls1, X, None, False)
# convert np array to pyspark dataframe
multi_cls_df_train_data = [
(Vectors.dense(X[0, :]), int(y[0])),
(Vectors.dense(X[1, :]), int(y[1])),
(Vectors.sparse(3, {1: float(X[2, 1]), 2: float(X[2, 2])}), int(y[2])),
(Vectors.dense(X[3, :]), int(y[3])),
]
multi_clf_df_train = spark.createDataFrame(
multi_cls_df_train_data, ["features", "label"]
)
multi_clf_df_test = spark.createDataFrame(
[
(
Vectors.dense(X[0, :]),
float(predt0[0]),
proba0[0, :].tolist(),
pred_contrib0[0, :].tolist(),
),
(
Vectors.dense(X[1, :]),
float(predt0[1]),
proba0[1, :].tolist(),
pred_contrib0[1, :].tolist(),
),
(
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
float(predt0[2]),
proba0[2, :].tolist(),
pred_contrib0[2, :].tolist(),
),
],
[
"features",
"expected_prediction",
"expected_probability",
"expected_pred_contribs",
],
)
yield MultiClfData(multi_clf_df_train, multi_clf_df_test)
ClfWithWeight = namedtuple(
"ClfWithWeight",
(
@@ -264,10 +362,12 @@ def clf_data(spark: SparkSession) -> Generator[ClfData, None, None]:
cl1.fit(X, y)
predt0 = cl1.predict(X)
proba0: np.ndarray = cl1.predict_proba(X)
cl2 = xgb.XGBClassifier(max_depth=5, n_estimators=10, scale_pos_weight=4)
pred_contrib0: np.ndarray = pred_contribs(cl1, X, None, True)
cl2 = xgb.XGBClassifier(**cls_params)
cl2.fit(X, y)
predt1 = cl2.predict(X)
proba1: np.ndarray = cl2.predict_proba(X)
pred_contrib1: np.ndarray = pred_contribs(cl2, X, None, True)
# convert np array to pyspark dataframe
cls_df_train_data = [
@@ -286,23 +386,29 @@ def clf_data(spark: SparkSession) -> Generator[ClfData, None, None]:
Vectors.dense(X[0, :]),
int(predt0[0]),
proba0[0, :].tolist(),
pred_contrib0[0, :].tolist(),
int(predt1[0]),
proba1[0, :].tolist(),
pred_contrib1[0, :].tolist(),
),
(
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
int(predt0[1]),
proba0[1, :].tolist(),
pred_contrib0[1, :].tolist(),
int(predt1[1]),
proba1[1, :].tolist(),
pred_contrib1[1, :].tolist(),
),
],
[
"features",
"expected_prediction",
"expected_probability",
"expected_pred_contribs",
"expected_prediction_with_params",
"expected_probability_with_params",
"expected_pred_contribs_with_params",
],
)
yield ClfData(cls_params, cls_df_train, cls_df_train_large, cls_df_test)
@@ -331,6 +437,16 @@ def get_params_map(params_kv: dict, estimator: Type) -> dict:
class TestPySparkLocal:
def test_regressor_basic(self, reg_data: RegData) -> None:
regressor = SparkXGBRegressor(pred_contrib_col="pred_contribs")
model = regressor.fit(reg_data.reg_df_train)
pred_result = model.transform(reg_data.reg_df_test).collect()
for row in pred_result:
np.testing.assert_equal(row.prediction, row.expected_prediction)
np.testing.assert_allclose(
row.pred_contribs, row.expected_pred_contribs, atol=1e-3
)
def test_regressor_with_weight_eval(self, reg_with_weight: RegWithWeight) -> None:
# with weight
regressor_with_weight = SparkXGBRegressor(weight_col="weight")
@@ -385,6 +501,19 @@ class TestPySparkLocal:
atol=1e-3,
)
def test_multi_classifier_basic(self, multi_clf_data: MultiClfData) -> None:
cls = SparkXGBClassifier(pred_contrib_col="pred_contribs")
model = cls.fit(multi_clf_data.multi_clf_df_train)
pred_result = model.transform(multi_clf_data.multi_clf_df_test).collect()
for row in pred_result:
np.testing.assert_equal(row.prediction, row.expected_prediction)
np.testing.assert_allclose(
row.probability, row.expected_probability, rtol=1e-3
)
np.testing.assert_allclose(
row.pred_contribs, row.expected_pred_contribs, atol=1e-3
)
def test_classifier_with_weight_eval(self, clf_with_weight: ClfWithWeight) -> None:
# with weight
classifier_with_weight = SparkXGBClassifier(weight_col="weight")
@@ -459,13 +588,18 @@ class TestPySparkLocal:
assert_model_compatible(model, tmpdir)
def test_classifier_basic(self, clf_data: ClfData) -> None:
classifier = SparkXGBClassifier()
classifier = SparkXGBClassifier(
**clf_data.cls_params, pred_contrib_col="pred_contrib"
)
model = classifier.fit(clf_data.cls_df_train)
pred_result = model.transform(clf_data.cls_df_test).collect()
for row in pred_result:
np.testing.assert_equal(row.prediction, row.expected_prediction)
np.testing.assert_equal(row.prediction, row.expected_prediction_with_params)
np.testing.assert_allclose(
row.probability, row.expected_probability, rtol=1e-3
row.probability, row.expected_probability_with_params, rtol=1e-3
)
np.testing.assert_equal(
row.pred_contrib, row.expected_pred_contribs_with_params
)
def test_classifier_with_params(self, clf_data: ClfData) -> None:
@@ -648,24 +782,6 @@ class XgboostLocalTest(SparkTestCase):
# >>> cl.fit(X, y)
# >>> cl.predict_proba(np.array([[1.0, 2.0, 3.0]]))
# array([[0.5374299 , 0.23128504, 0.23128504]], dtype=float32)
multi_cls_df_train_data = [
(Vectors.dense(1.0, 2.0, 3.0), 0),
(Vectors.dense(1.0, 2.0, 4.0), 0),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
(Vectors.dense(-1.0, -2.0, 1.0), 2),
]
self.multi_cls_df_train = self.session.createDataFrame(
multi_cls_df_train_data, ["features", "label"]
)
self.multi_cls_df_train_large = self.session.createDataFrame(
multi_cls_df_train_data * 100, ["features", "label"]
)
self.multi_cls_df_test = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), [0.5374, 0.2312, 0.2312]),
],
["features", "expected_probability"],
)
# Test classifier with both base margin and without
# >>> import numpy as np
@@ -872,24 +988,6 @@ class XgboostLocalTest(SparkTestCase):
== "float64"
)
def test_regressor_basic(self):
regressor = SparkXGBRegressor()
model = regressor.fit(self.reg_df_train)
pred_result = model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(row.prediction, row.expected_prediction, atol=1e-3)
)
def test_multi_classifier(self):
classifier = SparkXGBClassifier()
model = classifier.fit(self.multi_cls_df_train)
pred_result = model.transform(self.multi_cls_df_test).collect()
for row in pred_result:
self.assertTrue(
np.allclose(row.probability, row.expected_probability, rtol=1e-3)
)
def test_regressor_with_params(self):
regressor = SparkXGBRegressor(**self.reg_params)
all_params = dict(