Compare commits

...

8 Commits

Author SHA1 Message Date
Jiaming Yuan
584b45a9cc Release 1.5.0. (#7317) 2021-10-15 12:21:04 +08:00
Jiaming Yuan
30c1b5c54c [backport] Fix prediction with cat data in sklearn interface. (#7306) (#7312)
* Specify DMatrix parameter for pre-processing dataframe.
* Add document about the behaviour of prediction.
2021-10-12 18:49:57 +08:00
Jiaming Yuan
36e247aca4 Fix weighted samples in multi-class AUC. (#7300) (#7305) 2021-10-11 18:00:36 +08:00
Jiaming Yuan
c4aff733bb [backport] Fix cv verbose_eval (#7291) (#7296) 2021-10-08 14:24:27 +08:00
Jiaming Yuan
cdbfd21d31 [backport] Fix gamma neg log likelihood. (#7275) (#7285) 2021-10-05 23:01:11 +08:00
Jiaming Yuan
508a0b0dbd [backport] [R] Fix document for nthread. (#7263) (#7269) 2021-09-28 14:41:32 +08:00
Jiaming Yuan
e04e773f9f Add RC1 tag for building packages. (#7261) 2021-09-28 11:50:18 +08:00
Jiaming Yuan
1debabb321 Change version to 1.5.0. (#7258) 2021-09-26 13:27:54 +08:00
24 changed files with 169 additions and 73 deletions

View File

@@ -2,7 +2,7 @@ Package: xgboost
Type: Package Type: Package
Title: Extreme Gradient Boosting Title: Extreme Gradient Boosting
Version: 1.5.0.1 Version: 1.5.0.1
Date: 2020-08-28 Date: 2021-10-13
Authors@R: c( Authors@R: c(
person("Tianqi", "Chen", role = c("aut"), person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"), email = "tianqi.tchen@gmail.com"),

View File

@@ -11,6 +11,7 @@
#' @param missing a float value to represent missing values in data (used only when input is a dense matrix). #' @param missing a float value to represent missing values in data (used only when input is a dense matrix).
#' It is useful when a 0 or some other extreme value represents missing values in data. #' It is useful when a 0 or some other extreme value represents missing values in data.
#' @param silent whether to suppress printing an informational message after loading from a file. #' @param silent whether to suppress printing an informational message after loading from a file.
#' @param nthread Number of threads used for creating DMatrix.
#' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list. #' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list.
#' #'
#' @examples #' @examples

View File

@@ -4,7 +4,14 @@
\alias{xgb.DMatrix} \alias{xgb.DMatrix}
\title{Construct xgb.DMatrix object} \title{Construct xgb.DMatrix object}
\usage{ \usage{
xgb.DMatrix(data, info = list(), missing = NA, silent = FALSE, ...) xgb.DMatrix(
data,
info = list(),
missing = NA,
silent = FALSE,
nthread = NULL,
...
)
} }
\arguments{ \arguments{
\item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character \item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character
@@ -18,6 +25,8 @@ It is useful when a 0 or some other extreme value represents missing values in d
\item{silent}{whether to suppress printing an informational message after loading from a file.} \item{silent}{whether to suppress printing an informational message after loading from a file.}
\item{nthread}{Number of threads used for creating DMatrix.}
\item{...}{the \code{info} data could be passed directly as parameters, without creating an \code{info} list.} \item{...}{the \code{info} data could be passed directly as parameters, without creating an \code{info} list.}
} }
\description{ \description{

View File

@@ -1 +1 @@
@xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@-dev @xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@

View File

@@ -18,7 +18,7 @@ Making a Release
1. Create an issue for the release, noting the estimated date and expected features or major fixes, pin that issue. 1. Create an issue for the release, noting the estimated date and expected features or major fixes, pin that issue.
2. Bump release version. 2. Bump release version.
1. Modify ``CMakeLists.txt`` source tree, run CMake. 1. Modify ``CMakeLists.txt`` in source tree and ``cmake/Python_version.in`` if needed, run CMake.
2. Modify ``DESCRIPTION`` in R-package. 2. Modify ``DESCRIPTION`` in R-package.
3. Run ``change_version.sh`` in ``jvm-packages/dev`` 3. Run ``change_version.sh`` in ``jvm-packages/dev``
3. Commit the change, create a PR on GitHub on release branch. Port the bumped version to default branch, optionally with the postfix ``SNAPSHOT``. 3. Commit the change, create a PR on GitHub on release branch. Port the bumped version to default branch, optionally with the postfix ``SNAPSHOT``.

View File

@@ -6,7 +6,7 @@
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<name>XGBoost JVM Package</name> <name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description> <description>JVM Package for XGBoost</description>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</parent> </parent>
<artifactId>xgboost4j-example_2.12</artifactId> <artifactId>xgboost4j-example_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<build> <build>
<plugins> <plugins>
@@ -26,7 +26,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId> <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId> <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</parent> </parent>
<artifactId>xgboost4j-flink_2.12</artifactId> <artifactId>xgboost4j-flink_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
@@ -26,7 +26,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</parent> </parent>
<artifactId>xgboost4j-gpu_2.12</artifactId> <artifactId>xgboost4j-gpu_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<properties> <properties>

View File

@@ -6,7 +6,7 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</parent> </parent>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId> <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build> <build>
@@ -24,7 +24,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId> <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>

View File

@@ -6,7 +6,7 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</parent> </parent>
<artifactId>xgboost4j-spark_2.12</artifactId> <artifactId>xgboost4j-spark_2.12</artifactId>
<build> <build>
@@ -24,7 +24,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
</parent> </parent>
<artifactId>xgboost4j_2.12</artifactId> <artifactId>xgboost4j_2.12</artifactId>
<version>1.5.0-SNAPSHOT</version> <version>1.5.0</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<dependencies> <dependencies>

View File

@@ -1 +1 @@
1.5.0-dev 1.5.0

View File

@@ -174,7 +174,9 @@ __model_doc = f'''
Device ordinal. Device ordinal.
validate_parameters : Optional[bool] validate_parameters : Optional[bool]
Give warnings for unknown parameter. Give warnings for unknown parameter.
predictor : Optional[str]
Force XGBoost to use specific predictor, available choices are [cpu_predictor,
gpu_predictor].
enable_categorical : bool enable_categorical : bool
.. versionadded:: 1.5.0 .. versionadded:: 1.5.0
@@ -807,7 +809,11 @@ class XGBModel(XGBModelBase):
# Inplace predict doesn't handle as many data types as DMatrix, but it's # Inplace predict doesn't handle as many data types as DMatrix, but it's
# sufficient for dask interface where input is simpler. # sufficient for dask interface where input is simpler.
predictor = self.get_params().get("predictor", None) predictor = self.get_params().get("predictor", None)
if predictor in ("auto", None) and self.booster != "gblinear": if (
not self.enable_categorical
and predictor in ("auto", None)
and self.booster != "gblinear"
):
return True return True
return False return False
@@ -834,7 +840,9 @@ class XGBModel(XGBModelBase):
iteration_range: Optional[Tuple[int, int]] = None, iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray: ) -> np.ndarray:
"""Predict with `X`. If the model is trained with early stopping, then `best_iteration` """Predict with `X`. If the model is trained with early stopping, then `best_iteration`
is used automatically. is used automatically. For tree models, when data is on GPU, like cupy array or
cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
automatically, otherwise it will run on CPU.
.. note:: This function is only thread safe for `gbtree` and `dart`. .. note:: This function is only thread safe for `gbtree` and `dart`.
@@ -862,6 +870,7 @@ class XGBModel(XGBModelBase):
Returns Returns
------- -------
prediction prediction
""" """
iteration_range = _convert_ntree_limit( iteration_range = _convert_ntree_limit(
self.get_booster(), ntree_limit, iteration_range self.get_booster(), ntree_limit, iteration_range
@@ -886,7 +895,10 @@ class XGBModel(XGBModelBase):
pass pass
test = DMatrix( test = DMatrix(
X, base_margin=base_margin, missing=self.missing, nthread=self.n_jobs X, base_margin=base_margin,
missing=self.missing,
nthread=self.n_jobs,
enable_categorical=self.enable_categorical
) )
return self.get_booster().predict( return self.get_booster().predict(
data=test, data=test,

View File

@@ -472,13 +472,15 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
if is_new_callback: if is_new_callback:
assert all(isinstance(c, callback.TrainingCallback) assert all(isinstance(c, callback.TrainingCallback)
for c in callbacks), "You can't mix new and old callback styles." for c in callbacks), "You can't mix new and old callback styles."
if isinstance(verbose_eval, bool) and verbose_eval: if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval verbose_eval = 1 if verbose_eval is True else verbose_eval
callbacks.append(callback.EvaluationMonitor(period=verbose_eval, callbacks.append(
show_stdv=show_stdv)) callback.EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
)
if early_stopping_rounds: if early_stopping_rounds:
callbacks.append(callback.EarlyStopping( callbacks.append(
rounds=early_stopping_rounds, maximize=maximize)) callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
)
callbacks = callback.CallbackContainer(callbacks, metric=feval, is_cv=True) callbacks = callback.CallbackContainer(callbacks, metric=feval, is_cv=True)
else: else:
callbacks = _configure_deprecated_callbacks( callbacks = _configure_deprecated_callbacks(

View File

@@ -291,7 +291,7 @@ float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info
// labels is a vector of size n_samples. // labels is a vector of size n_samples.
float label = labels[idx % n_samples] == class_id; float label = labels[idx % n_samples] == class_id;
float w = get_weight(i % n_samples); float w = weights.empty() ? 1.0f : weights[d_sorted_idx[i] % n_samples];
float fp = (1.0 - label) * w; float fp = (1.0 - label) * w;
float tp = label * w; float tp = label * w;
return thrust::make_pair(fp, tp); return thrust::make_pair(fp, tp);

View File

@@ -309,10 +309,9 @@ struct EvalGammaNLogLik {
float constexpr kPsi = 1.0; float constexpr kPsi = 1.0;
bst_float theta = -1. / py; bst_float theta = -1. / py;
bst_float a = kPsi; bst_float a = kPsi;
// b = -std::log(-theta); float b = -std::log(-theta);
float b = 1.0f; // c = 1. / kPsi^2 * std::log(y/kPsi) - std::log(y) - common::LogGamma(1. / kPsi);
// c = 1. / kPsi * std::log(y/kPsi) - std::log(y) - common::LogGamma(1. / kPsi); // = 1.0f * std::log(y) - std::log(y) - 0 = 0
// = 1.0f * std::log(y) - std::log(y) - 0 = 0
float c = 0; float c = 0;
// general form for exponential family. // general form for exponential family.
return -((y * theta - b) / a + c); return -((y * theta - b) / a + c);

View File

@@ -143,7 +143,7 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
} }
xgboost::bst_float GetMetricEval(xgboost::Metric * metric, xgboost::bst_float GetMetricEval(xgboost::Metric * metric,
xgboost::HostDeviceVector<xgboost::bst_float> preds, xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
std::vector<xgboost::bst_float> labels, std::vector<xgboost::bst_float> labels,
std::vector<xgboost::bst_float> weights, std::vector<xgboost::bst_float> weights,
std::vector<xgboost::bst_uint> groups) { std::vector<xgboost::bst_uint> groups) {

View File

@@ -86,7 +86,7 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
xgboost::bst_float GetMetricEval( xgboost::bst_float GetMetricEval(
xgboost::Metric * metric, xgboost::Metric * metric,
xgboost::HostDeviceVector<xgboost::bst_float> preds, xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
std::vector<xgboost::bst_float> labels, std::vector<xgboost::bst_float> labels,
std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(), std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(),
std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>()); std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>());

View File

@@ -90,6 +90,16 @@ TEST(Metric, DeclareUnifiedTest(MultiAUC)) {
}, },
{0, 1, 1}); // no class 2. {0, 1, 1}); // no class 2.
EXPECT_TRUE(std::isnan(auc)) << auc; EXPECT_TRUE(std::isnan(auc)) << auc;
HostDeviceVector<float> predts{
0.0f, 1.0f, 0.0f,
1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f,
0.0f, 0.0f, 1.0f,
};
std::vector<float> labels {1.0f, 0.0f, 2.0f, 1.0f};
auc = GetMetricEval(metric, predts, labels, {1.0f, 2.0f, 3.0f, 4.0f});
ASSERT_GT(auc, 0.714);
} }
TEST(Metric, DeclareUnifiedTest(RankingAUC)) { TEST(Metric, DeclareUnifiedTest(RankingAUC)) {

View File

@@ -13,9 +13,11 @@ class TestGPUEvalMetrics:
def test_roc_auc_binary(self, n_samples): def test_roc_auc_binary(self, n_samples):
self.cpu_test.run_roc_auc_binary("gpu_hist", n_samples) self.cpu_test.run_roc_auc_binary("gpu_hist", n_samples)
@pytest.mark.parametrize("n_samples", [4, 100, 1000]) @pytest.mark.parametrize(
def test_roc_auc_multi(self, n_samples): "n_samples,weighted", [(4, False), (100, False), (1000, False), (1000, True)]
self.cpu_test.run_roc_auc_multi("gpu_hist", n_samples) )
def test_roc_auc_multi(self, n_samples, weighted):
self.cpu_test.run_roc_auc_multi("gpu_hist", n_samples, weighted)
@pytest.mark.parametrize("n_samples", [4, 100, 1000]) @pytest.mark.parametrize("n_samples", [4, 100, 1000])
def test_roc_auc_ltr(self, n_samples): def test_roc_auc_ltr(self, n_samples):

View File

@@ -59,6 +59,7 @@ def test_categorical():
) )
X = pd.DataFrame(X.todense()).astype("category") X = pd.DataFrame(X.todense()).astype("category")
clf.fit(X, y) clf.fit(X, y)
assert not clf._can_use_inplace_predict()
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
model = os.path.join(tempdir, "categorial.json") model = os.path.join(tempdir, "categorial.json")

View File

@@ -1,3 +1,4 @@
from typing import Union
import xgboost as xgb import xgboost as xgb
import pytest import pytest
import os import os
@@ -22,29 +23,47 @@ class TestCallbacks:
cls.X_valid = X[split:, ...] cls.X_valid = X[split:, ...]
cls.y_valid = y[split:, ...] cls.y_valid = y[split:, ...]
def run_evaluation_monitor(self, D_train, D_valid, rounds, verbose_eval): def run_evaluation_monitor(
evals_result = {} self,
with tm.captured_output() as (out, err): D_train: xgb.DMatrix,
xgb.train({'objective': 'binary:logistic', D_valid: xgb.DMatrix,
'eval_metric': 'error'}, D_train, rounds: int,
evals=[(D_train, 'Train'), (D_valid, 'Valid')], verbose_eval: Union[bool, int]
num_boost_round=rounds, ):
evals_result=evals_result, def check_output(output: str) -> None:
verbose_eval=verbose_eval) if int(verbose_eval) == 1:
output: str = out.getvalue().strip() # Should print each iteration info
assert len(output.split('\n')) == rounds
elif int(verbose_eval) > rounds:
# Should print first and latest iteration info
assert len(output.split('\n')) == 2
else:
# Should print info by each period additionally to first and latest
# iteration
num_periods = rounds // int(verbose_eval)
# Extra information is required for latest iteration
is_extra_info_required = num_periods * int(verbose_eval) < (rounds - 1)
assert len(output.split('\n')) == (
1 + num_periods + int(is_extra_info_required)
)
if int(verbose_eval) == 1: evals_result: xgb.callback.TrainingCallback.EvalsLog = {}
# Should print each iteration info params = {'objective': 'binary:logistic', 'eval_metric': 'error'}
assert len(output.split('\n')) == rounds with tm.captured_output() as (out, err):
elif int(verbose_eval) > rounds: xgb.train(
# Should print first and latest iteration info params, D_train,
assert len(output.split('\n')) == 2 evals=[(D_train, 'Train'), (D_valid, 'Valid')],
else: num_boost_round=rounds,
# Should print info by each period additionally to first and latest iteration evals_result=evals_result,
num_periods = rounds // int(verbose_eval) verbose_eval=verbose_eval,
# Extra information is required for latest iteration )
is_extra_info_required = num_periods * int(verbose_eval) < (rounds - 1) output: str = out.getvalue().strip()
assert len(output.split('\n')) == 1 + num_periods + int(is_extra_info_required) check_output(output)
with tm.captured_output() as (out, err):
xgb.cv(params, D_train, num_boost_round=rounds, verbose_eval=verbose_eval)
output = out.getvalue().strip()
check_output(output)
def test_evaluation_monitor(self): def test_evaluation_monitor(self):
D_train = xgb.DMatrix(self.X_train, self.y_train) D_train = xgb.DMatrix(self.X_train, self.y_train)

View File

@@ -124,6 +124,35 @@ class TestEvalMetrics:
skl_gamma_dev = mean_gamma_deviance(y, score) skl_gamma_dev = mean_gamma_deviance(y, score)
np.testing.assert_allclose(gamma_dev, skl_gamma_dev, rtol=1e-6) np.testing.assert_allclose(gamma_dev, skl_gamma_dev, rtol=1e-6)
@pytest.mark.skipif(**tm.no_sklearn())
def test_gamma_lik(self) -> None:
import scipy.stats as stats
rng = np.random.default_rng(1994)
n_samples = 32
n_features = 10
X = rng.normal(0, 1, size=n_samples * n_features).reshape((n_samples, n_features))
alpha, loc, beta = 5.0, 11.1, 22
y = stats.gamma.rvs(alpha, loc=loc, scale=beta, size=n_samples, random_state=rng)
reg = xgb.XGBRegressor(tree_method="hist", objective="reg:gamma", n_estimators=64)
reg.fit(X, y, eval_metric="gamma-nloglik", eval_set=[(X, y)])
score = reg.predict(X)
booster = reg.get_booster()
nloglik = float(booster.eval(xgb.DMatrix(X, y)).split(":")[1].split(":")[0])
# \beta_i = - (1 / \theta_i a)
# where \theta_i is the canonical parameter
# XGBoost uses the canonical link function of gamma in evaluation function.
# so \theta = - (1.0 / y)
# dispersion is hardcoded as 1.0, so shape (a in scipy parameter) is also 1.0
beta = - (1.0 / (- (1.0 / y))) # == y
nloglik_stats = -stats.gamma.logpdf(score, a=1.0, scale=beta)
np.testing.assert_allclose(nloglik, np.mean(nloglik_stats), rtol=1e-3)
def run_roc_auc_binary(self, tree_method, n_samples): def run_roc_auc_binary(self, tree_method, n_samples):
import numpy as np import numpy as np
from sklearn.datasets import make_classification from sklearn.datasets import make_classification
@@ -162,11 +191,11 @@ class TestEvalMetrics:
np.testing.assert_allclose(skl_auc, auc, rtol=1e-6) np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
@pytest.mark.skipif(**tm.no_sklearn()) @pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.parametrize("n_samples", [4, 100, 1000]) @pytest.mark.parametrize("n_samples", [100, 1000])
def test_roc_auc(self, n_samples): def test_roc_auc(self, n_samples):
self.run_roc_auc_binary("hist", n_samples) self.run_roc_auc_binary("hist", n_samples)
def run_roc_auc_multi(self, tree_method, n_samples): def run_roc_auc_multi(self, tree_method, n_samples, weighted):
import numpy as np import numpy as np
from sklearn.datasets import make_classification from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_auc_score
@@ -184,8 +213,14 @@ class TestEvalMetrics:
n_classes=n_classes, n_classes=n_classes,
random_state=rng random_state=rng
) )
if weighted:
weights = rng.randn(n_samples)
weights -= weights.min()
weights /= weights.max()
else:
weights = None
Xy = xgb.DMatrix(X, y) Xy = xgb.DMatrix(X, y, weight=weights)
booster = xgb.train( booster = xgb.train(
{ {
"tree_method": tree_method, "tree_method": tree_method,
@@ -197,16 +232,22 @@ class TestEvalMetrics:
num_boost_round=8, num_boost_round=8,
) )
score = booster.predict(Xy) score = booster.predict(Xy)
skl_auc = roc_auc_score(y, score, average="weighted", multi_class="ovr") skl_auc = roc_auc_score(
y, score, average="weighted", sample_weight=weights, multi_class="ovr"
)
auc = float(booster.eval(Xy).split(":")[1]) auc = float(booster.eval(Xy).split(":")[1])
np.testing.assert_allclose(skl_auc, auc, rtol=1e-6) np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
X = rng.randn(*X.shape) X = rng.randn(*X.shape)
score = booster.predict(xgb.DMatrix(X)) score = booster.predict(xgb.DMatrix(X, weight=weights))
skl_auc = roc_auc_score(y, score, average="weighted", multi_class="ovr") skl_auc = roc_auc_score(
auc = float(booster.eval(xgb.DMatrix(X, y)).split(":")[1]) y, score, average="weighted", sample_weight=weights, multi_class="ovr"
np.testing.assert_allclose(skl_auc, auc, rtol=1e-6) )
auc = float(booster.eval(xgb.DMatrix(X, y, weight=weights)).split(":")[1])
np.testing.assert_allclose(skl_auc, auc, rtol=1e-5)
@pytest.mark.parametrize("n_samples", [4, 100, 1000]) @pytest.mark.parametrize(
def test_roc_auc_multi(self, n_samples): "n_samples,weighted", [(4, False), (100, False), (1000, False), (1000, True)]
self.run_roc_auc_multi("hist", n_samples) )
def test_roc_auc_multi(self, n_samples, weighted):
self.run_roc_auc_multi("hist", n_samples, weighted)