[dask, sklearn] Fix predict proba. (#6566)

* For sklearn:
  - Handles user-defined objective functions.
  - Handles `softmax`.

* For dask:
  - Use the implementation from sklearn; the previous implementation didn't perform any extra handling of the raw predictions (see the background sketch below).
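
For background: with `binary:logistic`, the booster's raw `predict` returns a single column of P(y=1) per row, so `predict_proba` has to stack the complementary class-zero column alongside it. A minimal numpy sketch of that transform (illustrative only, not part of the patch):

```python
import numpy as np

# Raw booster output for binary:logistic: one probability per row, P(y=1).
classone_probs = np.array([0.9, 0.2, 0.65])

# predict_proba must expose shape (n_samples, 2): columns [P(y=0), P(y=1)].
classzero_probs = 1.0 - classone_probs
proba = np.vstack((classzero_probs, classone_probs)).transpose()

print(proba.shape)  # (3, 2)
print(proba[0])     # approximately [0.1, 0.9]
```
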
Commit 60cfd14349 (parent 516a93d25c)
Author: Jiaming Yuan, 2021-01-05 08:29:06 +08:00 (committed by GitHub)
5 changed files with 74 additions and 9 deletions

python-package/xgboost/dask.py

@@ -40,6 +40,7 @@ from .training import train as worker_train
from .tracker import RabitTracker, get_host_ip
from .sklearn import XGBModel, XGBRegressorBase, XGBClassifierBase, _objective_decorator
from .sklearn import xgboost_model_doc
from .sklearn import _cls_predict_proba
if TYPE_CHECKING:
@@ -1504,6 +1505,10 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
            early_stopping_rounds=early_stopping_rounds,
            callbacks=callbacks)
        self._Booster = results['booster']
        if not callable(self.objective):
            self.objective = params["objective"]
        # pylint: disable=attribute-defined-outside-init
        self.evals_result_ = results['history']
        return self
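
Without this guard, a user-supplied callable objective would be clobbered: when the objective is a callable, training typically falls back to a default objective string in `params` internally (as in the sklearn wrapper), so the unconditional assignment would overwrite the user's function, and `predict_proba` would later apply the wrong transform to untransformed margins. A hedged sketch of the scenario on a local cluster (the `custom_logistic` helper is hypothetical, written only for illustration):

```python
import numpy as np
import dask.array as da
from dask.distributed import Client, LocalCluster
from xgboost.dask import DaskXGBClassifier

def custom_logistic(predt, dtrain):
    """Hypothetical user-defined objective: binary logistic grad/hess."""
    y = dtrain.get_label()
    p = 1.0 / (1.0 + np.exp(-predt))  # sigmoid of the raw margin
    return p - y, p * (1.0 - p)

if __name__ == "__main__":
    with Client(LocalCluster(n_workers=2)) as client:  # picked up as the default client
        X = da.random.random((1000, 4), chunks=250)
        y = (X[:, 0] > 0.5).astype(int)
        clf = DaskXGBClassifier(objective=custom_logistic, n_estimators=5)
        clf.fit(X, y)
        # With the guard above, the callable survives fit(), so predict_proba
        # knows it cannot convert the raw outputs into probabilities:
        assert callable(clf.objective)
```
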
@@ -1554,7 +1559,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
            data=test_dmatrix,
            validate_features=validate_features,
            output_margin=output_margin)
        return pred_probs
        return _cls_predict_proba(self.objective, pred_probs, da.vstack)

    # pylint: disable=arguments-differ,missing-docstring
    def predict_proba(
@@ -1593,6 +1598,8 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
            output_margin=output_margin,
            validate_features=validate_features
        )
        if output_margin:
            return pred_probs
        if self.n_classes_ == 2:
            preds = (pred_probs > 0.5).astype(int)
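
For the binary case the dask path mirrors the sklearn helper, just with `da.vstack` so the stacked result stays a lazy, distributed array; hard labels are then derived by thresholding at 0.5. A small sketch of both steps (toy values, assuming a `binary:logistic` model produced `pred_probs`):

```python
import numpy as np
import dask.array as da

# Stand-in for the 1-D dask array of P(y=1) returned by the distributed predict.
pred_probs = da.from_array(np.array([0.8, 0.3, 0.51]), chunks=2)

# Same transform as in _cls_predict_proba, but lazy:
proba = da.vstack((1.0 - pred_probs, pred_probs)).transpose()
print(proba.compute())  # approximately [[0.2, 0.8], [0.7, 0.3], [0.49, 0.51]]

# Hard class labels via the 0.5 threshold, as in the hunk above:
preds = (pred_probs > 0.5).astype(int)
print(preds.compute())  # [1 0 1]
```
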

python-package/xgboost/sklearn.py

@@ -819,6 +819,20 @@ class XGBModel(XGBModelBase):
        return np.array(json.loads(b.get_dump(dump_format='json')[0])['bias'])


def _cls_predict_proba(objective: Union[str, Callable], prediction: Any, vstack: Callable) -> Any:
    if objective == 'multi:softmax':
        raise ValueError('multi:softmax objective does not support predict_proba,'
                         ' use `multi:softprob` or `binary:logistic` instead.')
    if objective == 'multi:softprob' or callable(objective):
        # Return the prediction directly if the objective is defined by the
        # user, since we don't know how to perform the transformation.
        return prediction
    # Lastly, the binary logistic function.
    classone_probs = prediction
    classzero_probs = 1.0 - classone_probs
    return vstack((classzero_probs, classone_probs)).transpose()


@xgboost_model_doc(
    "Implementation of the scikit-learn API for XGBoost classification.",
    ['model', 'objective'], extra_parameters='''
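
The new helper covers three cases: `multi:softprob` and callable objectives pass through untouched, `multi:softmax` is rejected outright (it yields class labels, not probabilities), and everything else is treated as binary logistic output. A quick usage sketch against the private helper (assuming it stays importable from `xgboost.sklearn`):

```python
import numpy as np
from xgboost.sklearn import _cls_predict_proba

raw = np.array([0.9, 0.2])

# binary:logistic -> stacked into (n_samples, 2)
print(_cls_predict_proba('binary:logistic', raw, np.vstack).shape)  # (2, 2)

# multi:softprob, or any user-defined callable, passes through unchanged
print(_cls_predict_proba('multi:softprob', raw, np.vstack) is raw)  # True

# multi:softmax is rejected, since only class labels are available
try:
    _cls_predict_proba('multi:softmax', raw, np.vstack)
except ValueError as err:
    print(err)
```
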
@@ -929,7 +943,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
            verbose_eval=verbose, xgb_model=model,
            callbacks=callbacks)
        self.objective = params["objective"]
        if not callable(self.objective):
            self.objective = params["objective"]
        if evals_result:
            for val in evals_result.items():
                evals_result_key = list(val[1].keys())[0]
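
Same fix as on the dask side: previously `self.objective` was assigned unconditionally, so a callable objective was silently replaced by the objective string used internally during training. A hedged sketch of what the guard preserves (reusing the hypothetical `custom_logistic` helper from above):

```python
import numpy as np
from xgboost import XGBClassifier

def custom_logistic(predt, dtrain):
    """Hypothetical user-defined objective: binary logistic grad/hess."""
    y = dtrain.get_label()
    p = 1.0 / (1.0 + np.exp(-predt))
    return p - y, p * (1.0 - p)

X = np.random.rand(100, 4)
y = (X[:, 0] > 0.5).astype(int)

clf = XGBClassifier(objective=custom_logistic, n_estimators=5).fit(X, y)
# Before this patch clf.objective became a string here; now the callable
# survives, so _cls_predict_proba returns the raw predictions untouched.
assert callable(clf.objective)
```
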
@@ -1031,7 +1047,8 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        Returns
        -------
        prediction : numpy array
            a numpy array with the probability of each data example being of a given class.
            a numpy array of shape (n_samples, n_classes) with the
            probability of each data example being of a given class.
        """
        test_dmatrix = DMatrix(X, base_margin=base_margin,
                               missing=self.missing, nthread=self.n_jobs)
@@ -1040,11 +1057,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        class_probs = self.get_booster().predict(test_dmatrix,
                                                 ntree_limit=ntree_limit,
                                                 validate_features=validate_features)
        if self.objective == "multi:softprob":
            return class_probs
        classone_probs = class_probs
        classzero_probs = 1.0 - classone_probs
        return np.vstack((classzero_probs, classone_probs)).transpose()
        return _cls_predict_proba(self.objective, class_probs, np.vstack)

    def evals_result(self):
        """Return the evaluation results.