From 5f7b5a69213bbf82238832a277f1a11046b0d57f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 4 Jan 2024 14:52:48 +0800 Subject: [PATCH] Add tests for pickling with custom obj and metric. (#9943) --- doc/tutorials/custom_metric_obj.rst | 1 - doc/tutorials/saving_model.rst | 2 ++ python-package/xgboost/sklearn.py | 27 ++++++++++++-------- python-package/xgboost/testing/__init__.py | 7 ++++++ tests/python/test_pickling.py | 27 ++++++++++++++++++++ tests/python/test_with_sklearn.py | 29 ++++++++++------------ 6 files changed, 65 insertions(+), 28 deletions(-) diff --git a/doc/tutorials/custom_metric_obj.rst b/doc/tutorials/custom_metric_obj.rst index 76ee1b3de..118a099c1 100644 --- a/doc/tutorials/custom_metric_obj.rst +++ b/doc/tutorials/custom_metric_obj.rst @@ -279,7 +279,6 @@ available at :ref:`sphx_glr_python_examples_custom_softmax.py`. Also, see Scikit-Learn Interface ********************** - The scikit-learn interface of XGBoost has some utilities to improve the integration with standard scikit-learn functions. For instance, after XGBoost 1.6.0 users can use the cost function (not scoring functions) from scikit-learn out of the box: diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst index 54c217249..34b5430df 100644 --- a/doc/tutorials/saving_model.rst +++ b/doc/tutorials/saving_model.rst @@ -101,6 +101,8 @@ snapshot generated by an earlier version of XGBoost may result in errors or unde **If a model is persisted with** ``pickle.dump`` (Python) or ``saveRDS`` (R), **then the model may not be accessible in later versions of XGBoost.** +.. _custom-obj-metric: + *************************** Custom objective and metric *************************** diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index ea8d8d041..3383ae0b7 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -192,11 +192,16 @@ __model_doc = f""" Boosting learning rate (xgb's "eta") verbosity : Optional[int] The degree of verbosity. Valid values are 0 (silent) - 3 (debug). + objective : {SklObjective} - Specify the learning task and the corresponding learning objective or - a custom objective function to be used (see note below). + + Specify the learning task and the corresponding learning objective or a custom + objective function to be used. For custom objective, see + :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more + information. + booster: Optional[str] - Specify which booster to use: gbtree, gblinear or dart. + Specify which booster to use: `gbtree`, `gblinear` or `dart`. tree_method: Optional[str] Specify which tree method to use. Default to auto. If this parameter is set to default, XGBoost will choose the most conservative option available. It's @@ -328,21 +333,21 @@ __model_doc = f""" Metric used for monitoring the training result and early stopping. It can be a string or list of strings as names of predefined metric in XGBoost (See - doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any other - user defined metric that looks like `sklearn.metrics`. + doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any + other user defined metric that looks like `sklearn.metrics`. If custom objective is also provided, then custom metric should implement the corresponding reverse link function. 
Unlike the `scoring` parameter commonly used in scikit-learn, when a callable - object is provided, it's assumed to be a cost function and by default XGBoost will - minimize the result during early stopping. + object is provided, it's assumed to be a cost function and by default XGBoost + will minimize the result during early stopping. - For advanced usage on Early stopping like directly choosing to maximize instead of - minimize, see :py:obj:`xgboost.callback.EarlyStopping`. + For advanced usage on Early stopping like directly choosing to maximize instead + of minimize, see :py:obj:`xgboost.callback.EarlyStopping`. - See :doc:`Custom Objective and Evaluation Metric ` - for more. + See :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more + information. .. note:: diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 6b8daf561..373ad1c58 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -815,6 +815,13 @@ def softprob_obj( return objective +def ls_obj(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """Least squared error.""" + grad = y_pred - y_true + hess = np.ones(len(y_true)) + return grad, hess + + class DirectoryExcursion: """Change directory. Change back and optionally cleaning up the directory when exit. diff --git a/tests/python/test_pickling.py b/tests/python/test_pickling.py index 2f4d77bf0..083a2a7fd 100644 --- a/tests/python/test_pickling.py +++ b/tests/python/test_pickling.py @@ -1,10 +1,13 @@ import json import os import pickle +import tempfile import numpy as np +import pytest import xgboost as xgb +from xgboost import testing as tm kRows = 100 kCols = 10 @@ -61,3 +64,27 @@ class TestPickling: params = {"nthread": 8, "tree_method": "exact", "subsample": 0.5} config = self.run_model_pickling(params) check(config) + + @pytest.mark.skipif(**tm.no_sklearn()) + def test_with_sklearn_obj_metric(self) -> None: + from sklearn.metrics import mean_squared_error + + X, y = tm.datasets.make_regression() + reg = xgb.XGBRegressor(objective=tm.ls_obj, eval_metric=mean_squared_error) + reg.fit(X, y) + + pkl = pickle.dumps(reg) + reg_1 = pickle.loads(pkl) + assert callable(reg_1.objective) + assert callable(reg_1.eval_metric) + + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "model.json") + reg.save_model(path) + + reg_2 = xgb.XGBRegressor() + reg_2.load_model(path) + + assert not callable(reg_2.objective) + assert not callable(reg_2.eval_metric) + assert reg_2.eval_metric is None diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 1e49ed053..ee0085d51 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -504,15 +504,10 @@ def test_regression_with_custom_objective(): from sklearn.metrics import mean_squared_error from sklearn.model_selection import KFold - def objective_ls(y_true, y_pred): - grad = (y_pred - y_true) - hess = np.ones(len(y_true)) - return grad, hess - X, y = fetch_california_housing(return_X_y=True) kf = KFold(n_splits=2, shuffle=True, random_state=rng) for train_index, test_index in kf.split(X, y): - xgb_model = xgb.XGBRegressor(objective=objective_ls).fit( + xgb_model = xgb.XGBRegressor(objective=tm.ls_obj).fit( X[train_index], y[train_index] ) preds = xgb_model.predict(X[test_index]) @@ -530,27 +525,29 @@ def test_regression_with_custom_objective(): np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, 
X, y) +def logregobj(y_true, y_pred): + y_pred = 1.0 / (1.0 + np.exp(-y_pred)) + grad = y_pred - y_true + hess = y_pred * (1.0 - y_pred) + return grad, hess + + def test_classification_with_custom_objective(): from sklearn.datasets import load_digits from sklearn.model_selection import KFold - def logregobj(y_true, y_pred): - y_pred = 1.0 / (1.0 + np.exp(-y_pred)) - grad = y_pred - y_true - hess = y_pred * (1.0 - y_pred) - return grad, hess - digits = load_digits(n_class=2) - y = digits['target'] - X = digits['data'] + y = digits["target"] + X = digits["data"] kf = KFold(n_splits=2, shuffle=True, random_state=rng) for train_index, test_index in kf.split(X, y): xgb_model = xgb.XGBClassifier(objective=logregobj) xgb_model.fit(X[train_index], y[train_index]) preds = xgb_model.predict(X[test_index]) labels = y[test_index] - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.1 # Test that the custom objective function is actually used
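
For reference, the behaviour exercised by the new `test_with_sklearn_obj_metric` test can be reproduced outside the test suite with a short standalone script. This is a minimal sketch, not part of the patch: the `random_state`, the temporary file name, and the local `ls_obj` definition (mirroring the `xgboost.testing.ls_obj` helper added above) are illustrative choices, and it assumes scikit-learn is installed, as the test's `skipif` guard does.

    import os
    import pickle
    import tempfile

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.metrics import mean_squared_error

    import xgboost as xgb


    def ls_obj(y_true, y_pred):
        # Least squares objective: gradient and constant hessian of the
        # squared error, same as the helper added to xgboost.testing.
        grad = y_pred - y_true
        hess = np.ones(len(y_true))
        return grad, hess


    X, y = make_regression(random_state=0)

    # A callable objective and a callable metric survive a pickle round trip ...
    reg = xgb.XGBRegressor(objective=ls_obj, eval_metric=mean_squared_error)
    reg.fit(X, y)

    reg_pkl = pickle.loads(pickle.dumps(reg))
    assert callable(reg_pkl.objective)
    assert callable(reg_pkl.eval_metric)

    # ... but are not written into the JSON model file, so after load_model the
    # objective is no longer the Python callable and eval_metric is None.
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "model.json")
        reg.save_model(path)

        reg_json = xgb.XGBRegressor()
        reg_json.load_model(path)
        assert not callable(reg_json.objective)
        assert reg_json.eval_metric is None

This asymmetry is what the documentation changes above point at: pickle serialises the whole Python object, callables included, while save_model writes only the language-neutral JSON representation, so custom objectives and metrics have to be supplied again after load_model. The new `custom-obj-metric` anchor in saving_model.rst gives the parameter docstrings in sklearn.py a place to link to for that caveat.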