Compare commits

..

13 Commits

Author SHA1 Message Date
fis
3e343159ef Release patch release 1.3.2 2021-01-13 17:35:00 +08:00
Jiaming Yuan
99e802f2ff Remove duplicated DMatrix. (#6592) (#6599) 2021-01-13 04:44:06 +08:00
Jiaming Yuan
6a29afb480 Fix evaluation result for XGBRanker. (#6594) (#6600)
* Remove duplicated code, which fixes typo `evals_result` -> `evals_result_`.
2021-01-13 04:42:43 +08:00
Jiaming Yuan
8e321adac8 Support Solaris. (#6578) (#6588)
* Add system header.

* Remove use of TR1 on Solaris

Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
2021-01-11 02:31:29 +08:00
Jiaming Yuan
d0ec65520a [backport] Fix best_ntree_limit for dart and gblinear. (#6579) (#6587)
* [backport] Fix `best_ntree_limit` for dart and gblinear. (#6579)

* Backport num group test fix.
2021-01-11 01:46:05 +08:00
Jiaming Yuan
7aec915dcd [Backport] Rename data to X in predict_proba. (#6555) (#6586)
* [Breaking] Rename `data` to `X` in `predict_proba`. (#6555)

New Scikit-Learn version uses keyword argument, and `X` is the predefined
keyword.

* Use pip to install latest Python graphviz on Windows CI.

* Suppress health check.
2021-01-10 16:05:17 +08:00
Philip Hyunsu Cho
a78d0d4110 Release patch release 1.3.1 (#6543) 2020-12-21 23:22:32 -08:00
Jiaming Yuan
76c361431f Remove cupy.array_equal, since it's not compatible with cuPy 7.8 (#6528) (#6535)
Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2020-12-20 15:11:50 +08:00
Jiaming Yuan
d95d02132a Fix handling of print period in EvaluationMonitor (#6499) (#6534)
Co-authored-by: Kirill Shvets <kirill.shvets@intel.com>

Co-authored-by: ShvetsKS <33296480+ShvetsKS@users.noreply.github.com>
Co-authored-by: Kirill Shvets <kirill.shvets@intel.com>
2020-12-20 15:07:42 +08:00
Jiaming Yuan
7109c6c1f2 [backport] Move metric configuration into booster. (#6504) (#6533) 2020-12-20 10:36:32 +08:00
Jiaming Yuan
bce7ca313c [backport] Fix save_best. (#6523) 2020-12-18 20:00:29 +08:00
Jiaming Yuan
8be2cd8c91 Enable loading model from <1.0.0 trained with objective='binary:logitraw' (#6517) (#6524)
* Enable loading model from <1.0.0 trained with objective='binary:logitraw'

* Add binary:logitraw in model compatibility testing suite

* Feedback from @trivialfis: Override ProbToMargin() for LogisticRaw

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2020-12-18 04:10:09 +08:00
Philip Hyunsu Cho
c5f0cdbc72 Hot fix for libgomp vendoring (#6482)
* Hot fix for libgomp vendoring

* Set post0 in setup.py
2020-12-09 10:04:45 -08:00
33 changed files with 260 additions and 133 deletions

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.13)
project(xgboost LANGUAGES CXX C VERSION 1.3.0)
project(xgboost LANGUAGES CXX C VERSION 1.3.2)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW)

4
Jenkinsfile vendored
View File

@@ -198,10 +198,10 @@ def BuildCUDA(args) {
"""
if (args.cuda_version == ref_cuda_ver) {
sh """
${dockerRun} ${container_type} ${docker_binary} ${docker_args} auditwheel repair --plat ${wheel_tag} python-package/dist/*.whl
${dockerRun} auditwheel_x86_64 ${docker_binary} auditwheel repair --plat ${wheel_tag} python-package/dist/*.whl
mv -v wheelhouse/*.whl python-package/dist/
# Make sure that libgomp.so is vendored in the wheel
${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
${dockerRun} auditwheel_x86_64 ${docker_binary} bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1"
"""
}
echo 'Stashing Python wheel...'

View File

@@ -1,7 +1,7 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
Version: 1.3.0.1
Version: 1.3.2.1
Date: 2020-08-28
Authors@R: c(
person("Tianqi", "Chen", role = c("aut"),

View File

@@ -2,7 +2,6 @@
# of saved model files from XGBoost version 0.90 and 1.0.x.
library(xgboost)
library(Matrix)
source('./generate_models_params.R')
set.seed(0)
metadata <- list(
@@ -53,11 +52,16 @@ generate_logistic_model <- function () {
y <- sample(0:1, size = metadata$kRows, replace = TRUE)
stopifnot(max(y) == 1, min(y) == 0)
data <- xgb.DMatrix(X, label = y, weight = w)
params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests,
max_depth = metadata$kMaxDepth, objective = 'binary:logistic')
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
save_booster(booster, 'logit')
objective <- c('binary:logistic', 'binary:logitraw')
name <- c('logit', 'logitraw')
for (i in seq_len(length(objective))) {
data <- xgb.DMatrix(X, label = y, weight = w)
params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests,
max_depth = metadata$kMaxDepth, objective = objective[i])
booster <- xgb.train(params, data, nrounds = metadata$kRounds)
save_booster(booster, name[i])
}
}
generate_classification_model <- function () {

View File

@@ -39,6 +39,10 @@ run_booster_check <- function (booster, name) {
testthat::expect_equal(config$learner$learner_train_param$objective, 'multi:softmax')
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class),
metadata$kClasses)
} else if (name == 'logitraw') {
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0)
testthat::expect_equal(config$learner$learner_train_param$objective, 'binary:logitraw')
} else if (name == 'logit') {
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0)

View File

@@ -55,7 +55,7 @@
#endif // defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4)
#if defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4) && \
!defined(__CUDACC__)
!defined(__CUDACC__) && !defined(__sun) && !defined(sun)
#include <parallel/algorithm>
#define XGBOOST_PARALLEL_SORT(X, Y, Z) __gnu_parallel::sort((X), (Y), (Z))
#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) \

View File

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 1
#define XGBOOST_VER_MINOR 3
#define XGBOOST_VER_PATCH 0
#define XGBOOST_VER_PATCH 2
#endif // XGBOOST_VERSION_CONFIG_H_

View File

@@ -34,9 +34,9 @@ TO_VERSION=$2
sed_i() {
perl -p -000 -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2"
}
export -f sed_i
BASEDIR=$(dirname $0)/..
find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \
-exec bash -c \

View File

@@ -6,7 +6,7 @@
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
<packaging>pom</packaging>
<name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</parent>
<artifactId>xgboost4j-example_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
<packaging>jar</packaging>
<build>
<plugins>
@@ -26,7 +26,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</parent>
<artifactId>xgboost4j-flink_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
<build>
<plugins>
<plugin>
@@ -26,7 +26,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</parent>
<artifactId>xgboost4j-gpu_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</parent>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build>
@@ -24,7 +24,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</parent>
<artifactId>xgboost4j-spark_2.12</artifactId>
<build>
@@ -24,7 +24,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
</parent>
<artifactId>xgboost4j_2.12</artifactId>
<version>1.3.0</version>
<version>1.3.2</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -1 +1 @@
1.3.0
1.3.2

View File

@@ -456,6 +456,7 @@ class LearningRateScheduler(TrainingCallback):
def after_iteration(self, model, epoch, evals_log):
model.set_param('learning_rate', self.learning_rates(epoch))
return False
# pylint: disable=too-many-instance-attributes
@@ -565,7 +566,7 @@ class EarlyStopping(TrainingCallback):
def after_training(self, model: Booster):
try:
if self.save_best:
model = model[: int(model.attr('best_iteration'))]
model = model[: int(model.attr('best_iteration')) + 1]
except XGBoostError as e:
raise XGBoostError('`save_best` is not applicable to current booster') from e
return model
@@ -621,7 +622,7 @@ class EvaluationMonitor(TrainingCallback):
msg += self._fmt_metric(data, metric_name, score, stdv)
msg += '\n'
if (epoch % self.period) != 0 or self.period == 1:
if (epoch % self.period) == 0 or self.period == 1:
rabit.tracker_print(msg)
self._latest = None
else:
@@ -677,6 +678,7 @@ class TrainingCheckPoint(TrainingCallback):
else:
model.save_model(path)
self._epoch += 1
return False
class LegacyCallbacks:

View File

@@ -1,11 +1,12 @@
# coding: utf-8
# pylint: disable=too-many-arguments, too-many-branches, invalid-name
# pylint: disable=too-many-lines, too-many-locals
# pylint: disable=too-many-lines, too-many-locals, no-self-use
"""Core XGBoost Library."""
import collections
# pylint: disable=no-name-in-module,import-error
from collections.abc import Mapping
# pylint: enable=no-name-in-module,import-error
from typing import Dict, Union, List
import ctypes
import os
import re
@@ -1012,6 +1013,7 @@ class Booster(object):
_check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
ctypes.byref(self.handle)))
params = params or {}
params = self._configure_metrics(params.copy())
if isinstance(params, list):
params.append(('validate_parameters', True))
else:
@@ -1041,6 +1043,17 @@ class Booster(object):
else:
raise TypeError('Unknown type:', model_file)
def _configure_metrics(self, params: Union[Dict, List]) -> Union[Dict, List]:
if isinstance(params, dict) and 'eval_metric' in params \
and isinstance(params['eval_metric'], list):
params = dict((k, v) for k, v in params.items())
eval_metrics = params['eval_metric']
params.pop("eval_metric", None)
params = list(params.items())
for eval_metric in eval_metrics:
params += [('eval_metric', eval_metric)]
return params
def __del__(self):
if hasattr(self, 'handle') and self.handle is not None:
_check_call(_LIB.XGBoosterFree(self.handle))

View File

@@ -1210,10 +1210,10 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
early_stopping_rounds=early_stopping_rounds,
verbose=verbose)
async def _predict_proba_async(self, data, output_margin=False,
async def _predict_proba_async(self, X, output_margin=False,
base_margin=None):
test_dmatrix = await DaskDMatrix(
client=self.client, data=data, base_margin=base_margin,
client=self.client, data=X, base_margin=base_margin,
missing=self.missing
)
pred_probs = await predict(client=self.client,
@@ -1223,11 +1223,11 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
return pred_probs
# pylint: disable=arguments-differ,missing-docstring
def predict_proba(self, data, output_margin=False, base_margin=None):
def predict_proba(self, X, output_margin=False, base_margin=None):
_assert_dask_support()
return self.client.sync(
self._predict_proba_async,
data,
X=X,
output_margin=output_margin,
base_margin=base_margin
)

View File

@@ -4,6 +4,7 @@
import copy
import warnings
import json
from typing import Optional
import numpy as np
from .core import Booster, DMatrix, XGBoostError, _deprecate_positional_args
from .training import train
@@ -494,6 +495,13 @@ class XGBModel(XGBModelBase):
# Delete the attribute after load
self.get_booster().set_attr(scikit_learn=None)
def _set_evaluation_result(self, evals_result: Optional[dict]) -> None:
if evals_result:
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
self.evals_result_ = evals_result
@_deprecate_positional_args
def fit(self, X, y, *, sample_weight=None, base_margin=None,
eval_set=None, eval_metric=None, early_stopping_rounds=None,
@@ -565,13 +573,6 @@ class XGBModel(XGBModelBase):
"""
self.n_features_in_ = X.shape[1]
train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
base_margin=base_margin,
missing=self.missing,
nthread=self.n_jobs)
train_dmatrix.set_info(feature_weights=feature_weights)
evals_result = {}
train_dmatrix, evals = self._wrap_evaluation_matrices(
@@ -601,12 +602,7 @@ class XGBModel(XGBModelBase):
verbose_eval=verbose, xgb_model=xgb_model,
callbacks=callbacks)
if evals_result:
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][
evals_result_key]
self.evals_result_ = evals_result
self._set_evaluation_result(evals_result)
if early_stopping_rounds is not None:
self.best_score = self._Booster.best_score
@@ -841,14 +837,18 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
self.classes_ = cp.unique(y.values)
self.n_classes_ = len(self.classes_)
can_use_label_encoder = False
if not cp.array_equal(self.classes_, cp.arange(self.n_classes_)):
expected_classes = cp.arange(self.n_classes_)
if (self.classes_.shape != expected_classes.shape or
not (self.classes_ == expected_classes).all()):
raise ValueError(label_encoding_check_error)
elif _is_cupy_array(y):
import cupy as cp # pylint: disable=E0401
self.classes_ = cp.unique(y)
self.n_classes_ = len(self.classes_)
can_use_label_encoder = False
if not cp.array_equal(self.classes_, cp.arange(self.n_classes_)):
expected_classes = cp.arange(self.n_classes_)
if (self.classes_.shape != expected_classes.shape or
not (self.classes_ == expected_classes).all()):
raise ValueError(label_encoding_check_error)
else:
self.classes_ = np.unique(y)
@@ -915,12 +915,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
callbacks=callbacks)
self.objective = xgb_options["objective"]
if evals_result:
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][
evals_result_key] = val[1][evals_result_key]
self.evals_result_ = evals_result
self._set_evaluation_result(evals_result)
if early_stopping_rounds is not None:
self.best_score = self._Booster.best_score
@@ -991,10 +986,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
return self._le.inverse_transform(column_indexes)
return column_indexes
def predict_proba(self, data, ntree_limit=None, validate_features=False,
def predict_proba(self, X, ntree_limit=None, validate_features=False,
base_margin=None):
"""
Predict the probability of each `data` example being of a given class.
""" Predict the probability of each `X` example being of a given class.
.. note:: This function is not thread safe
@@ -1004,21 +998,22 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
Parameters
----------
data : array_like
X : array_like
Feature matrix.
ntree_limit : int
Limit number of trees in the prediction; defaults to best_ntree_limit if defined
(i.e. it has been trained with early stopping), otherwise 0 (use all trees).
Limit number of trees in the prediction; defaults to best_ntree_limit if
defined (i.e. it has been trained with early stopping), otherwise 0 (use all
trees).
validate_features : bool
When this is True, validate that the Booster's and data's feature_names are identical.
Otherwise, it is assumed that the feature_names are the same.
When this is True, validate that the Booster's and data's feature_names are
identical. Otherwise, it is assumed that the feature_names are the same.
Returns
-------
prediction : numpy array
a numpy array with the probability of each data example being of a given class.
"""
test_dmatrix = DMatrix(data, base_margin=base_margin,
test_dmatrix = DMatrix(X, base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs)
if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0)
@@ -1324,12 +1319,7 @@ class XGBRanker(XGBModel):
self.objective = params["objective"]
if evals_result:
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
self.evals_result = evals_result
self._set_evaluation_result(evals_result)
if early_stopping_rounds is not None:
self.best_score = self._Booster.best_score
self.best_iteration = self._Booster.best_iteration

View File

@@ -4,6 +4,7 @@
"""Training Library containing training routines."""
import warnings
import copy
import json
import numpy as np
from .core import Booster, XGBoostError
@@ -40,18 +41,6 @@ def _is_new_callback(callbacks):
for c in callbacks) or not callbacks
def _configure_metrics(params):
if isinstance(params, dict) and 'eval_metric' in params \
and isinstance(params['eval_metric'], list):
params = dict((k, v) for k, v in params.items())
eval_metrics = params['eval_metric']
params.pop("eval_metric", None)
params = list(params.items())
for eval_metric in eval_metrics:
params += [('eval_metric', eval_metric)]
return params
def _train_internal(params, dtrain,
num_boost_round=10, evals=(),
obj=None, feval=None,
@@ -61,7 +50,6 @@ def _train_internal(params, dtrain,
"""internal training function"""
callbacks = [] if callbacks is None else copy.copy(callbacks)
evals = list(evals)
params = _configure_metrics(params.copy())
bst = Booster(params, [dtrain] + [d[0] for d in evals])
nboost = 0
@@ -136,7 +124,28 @@ def _train_internal(params, dtrain,
bst.best_iteration = int(bst.attr('best_iteration'))
else:
bst.best_iteration = nboost - 1
bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
config = json.loads(bst.save_config())
booster = config['learner']['gradient_booster']['name']
if booster == 'gblinear':
num_parallel_tree = 0
elif booster == 'dart':
num_parallel_tree = int(
config['learner']['gradient_booster']['gbtree']['gbtree_train_param'][
'num_parallel_tree'
]
)
elif booster == 'gbtree':
num_parallel_tree = int(
config['learner']['gradient_booster']['gbtree_train_param'][
'num_parallel_tree']
)
else:
raise ValueError(f'Unknown booster: {booster}')
num_groups = int(config['learner']['learner_model_param']['num_class'])
num_groups = 1 if num_groups == 0 else num_groups
bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree * num_groups
# Copy to serialise and unserialise booster to reset state and free
# training memory
return bst.copy()

View File

@@ -25,6 +25,10 @@
#include <sys/socket.h>
#include <sys/ioctl.h>
#if defined(__sun) || defined(sun)
#include <sys/sockio.h>
#endif // defined(__sun) || defined(sun)
#endif // defined(_WIN32)
#include <string>

View File

@@ -162,6 +162,9 @@ struct LogisticRaw : public LogisticRegression {
predt = common::Sigmoid(predt);
return std::max(predt * (T(1.0f) - predt), eps);
}
static bst_float ProbToMargin(bst_float base_score) {
return base_score;
}
static const char* DefaultEvalMetric() { return "auc"; }
static const char* Name() { return "binary:logitraw"; }

View File

@@ -0,0 +1,15 @@
FROM quay.io/pypa/manylinux2010_x86_64
# Install lightweight sudo (not bound to TTY)
ENV GOSU_VERSION 1.10
RUN set -ex; \
curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true
# Default entry-point to use if running locally
# It will preserve attributes of created files
COPY entrypoint.sh /scripts/
WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]

View File

@@ -9,7 +9,6 @@ dependencies:
- scikit-learn
- pandas
- pytest
- python-graphviz
- boto3
- hypothesis
- jsonschema
@@ -17,3 +16,4 @@ dependencies:
- pip:
- cupy-cuda101
- modin[all]
- graphviz

View File

@@ -5,8 +5,10 @@ import numpy as np
import asyncio
import xgboost
import subprocess
import hypothesis
from hypothesis import given, strategies, settings, note
from hypothesis._settings import duration
from hypothesis import HealthCheck
from test_gpu_updaters import parameter_strategy
if sys.platform.startswith("win"):
@@ -19,6 +21,11 @@ from test_with_dask import _get_client_workers # noqa
from test_with_dask import generate_array # noqa
import testing as tm # noqa
if hasattr(HealthCheck, 'function_scoped_fixture'):
suppress = [HealthCheck.function_scoped_fixture]
else:
suppress = hypothesis.utils.conventions.not_set
try:
import dask.dataframe as dd
@@ -161,19 +168,24 @@ class TestDistributedGPU:
run_with_dask_dataframe(dxgb.DaskDMatrix, client)
run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client)
@given(params=parameter_strategy, num_rounds=strategies.integers(1, 20),
dataset=tm.dataset_strategy)
@settings(deadline=duration(seconds=120))
@given(
params=parameter_strategy,
num_rounds=strategies.integers(1, 20),
dataset=tm.dataset_strategy,
)
@settings(deadline=duration(seconds=120), suppress_health_check=suppress)
@pytest.mark.skipif(**tm.no_dask())
@pytest.mark.skipif(**tm.no_dask_cuda())
@pytest.mark.parametrize('local_cuda_cluster', [{'n_workers': 2}], indirect=['local_cuda_cluster'])
@pytest.mark.parametrize(
"local_cuda_cluster", [{"n_workers": 2}], indirect=["local_cuda_cluster"]
)
@pytest.mark.mgpu
def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster):
with Client(local_cuda_cluster) as client:
run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix,
client)
run_gpu_hist(params, num_rounds, dataset,
dxgb.DaskDeviceQuantileDMatrix, client)
run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, client)
run_gpu_hist(
params, num_rounds, dataset, dxgb.DaskDeviceQuantileDMatrix, client
)
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.skipif(**tm.no_dask())

View File

@@ -64,22 +64,24 @@ def generate_logistic_model():
y = np.random.randint(0, 2, size=kRows)
assert y.max() == 1 and y.min() == 0
data = xgboost.DMatrix(X, label=y, weight=w)
booster = xgboost.train({'tree_method': 'hist',
'num_parallel_tree': kForests,
'max_depth': kMaxDepth,
'objective': 'binary:logistic'},
num_boost_round=kRounds, dtrain=data)
booster.save_model(booster_bin('logit'))
booster.save_model(booster_json('logit'))
for objective, name in [('binary:logistic', 'logit'), ('binary:logitraw', 'logitraw')]:
data = xgboost.DMatrix(X, label=y, weight=w)
booster = xgboost.train({'tree_method': 'hist',
'num_parallel_tree': kForests,
'max_depth': kMaxDepth,
'objective': objective},
num_boost_round=kRounds, dtrain=data)
booster.save_model(booster_bin(name))
booster.save_model(booster_json(name))
reg = xgboost.XGBClassifier(tree_method='hist',
num_parallel_tree=kForests,
max_depth=kMaxDepth,
n_estimators=kRounds)
reg.fit(X, y, w)
reg.save_model(skl_bin('logit'))
reg.save_model(skl_json('logit'))
reg = xgboost.XGBClassifier(tree_method='hist',
num_parallel_tree=kForests,
max_depth=kMaxDepth,
n_estimators=kRounds,
objective=objective)
reg.fit(X, y, w)
reg.save_model(skl_bin(name))
reg.save_model(skl_json(name))
def generate_classification_model():

View File

@@ -57,6 +57,25 @@ class TestBasic:
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0
def test_metric_config(self):
# Make sure that the metric configuration happens in booster so the
# string `['error', 'auc']` doesn't get passed down to core.
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
'objective': 'binary:logistic', 'eval_metric': ['error', 'auc']}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
booster = xgb.train(param, dtrain, num_round, watchlist)
predt_0 = booster.predict(dtrain)
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, 'model.json')
booster.save_model(path)
booster = xgb.Booster(params=param, model_file=path)
predt_1 = booster.predict(dtrain)
np.testing.assert_allclose(predt_0, predt_1)
def test_record_results(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
@@ -124,8 +143,8 @@ class TestBasic:
dump2 = bst.get_dump(with_stats=True)
assert dump2[0].count('\n') == 3, 'Expected 1 root and 2 leaves - 3 lines in dump.'
assert (dump2[0].find('\n') > dump1[0].find('\n'),
'Expected more info when with_stats=True is given.')
msg = 'Expected more info when with_stats=True is given.'
assert dump2[0].find('\n') > dump1[0].find('\n'), msg
dump3 = bst.get_dump(dump_format="json")
dump3j = json.loads(dump3[0])
@@ -248,13 +267,11 @@ class TestBasicPathLike:
assert binary_path.exists()
Path.unlink(binary_path)
def test_Booster_init_invalid_path(self):
"""An invalid model_file path should raise XGBoostError."""
with pytest.raises(xgb.core.XGBoostError):
xgb.Booster(model_file=Path("invalidpath"))
def test_Booster_save_and_load(self):
"""Saving and loading model files from paths."""
save_path = Path("saveload.model")

View File

@@ -33,15 +33,18 @@ class TestCallbacks:
verbose_eval=verbose_eval)
output: str = out.getvalue().strip()
pos = 0
msg = 'Train-error'
for i in range(rounds // int(verbose_eval)):
pos = output.find('Train-error', pos)
assert pos != -1
pos += len(msg)
assert output.find('Train-error', pos) == -1
if int(verbose_eval) == 1:
# Should print each iteration info
assert len(output.split('\n')) == rounds
elif int(verbose_eval) > rounds:
# Should print first and latest iteration info
assert len(output.split('\n')) == 2
else:
# Should print info by each period additionaly to first and latest iteration
num_periods = rounds // int(verbose_eval)
# Extra information is required for latest iteration
is_extra_info_required = num_periods * int(verbose_eval) < (rounds - 1)
assert len(output.split('\n')) == 1 + num_periods + int(is_extra_info_required)
def test_evaluation_monitor(self):
D_train = xgb.DMatrix(self.X_train, self.y_train)
@@ -57,8 +60,10 @@ class TestCallbacks:
assert len(evals_result['Train']['error']) == rounds
assert len(evals_result['Valid']['error']) == rounds
self.run_evaluation_monitor(D_train, D_valid, rounds, 2)
self.run_evaluation_monitor(D_train, D_valid, rounds, True)
self.run_evaluation_monitor(D_train, D_valid, rounds, 2)
self.run_evaluation_monitor(D_train, D_valid, rounds, 4)
self.run_evaluation_monitor(D_train, D_valid, rounds, rounds + 1)
def test_early_stopping(self):
D_train = xgb.DMatrix(self.X_train, self.y_train)
@@ -148,7 +153,7 @@ class TestCallbacks:
eval_metric=tm.eval_error_metric, callbacks=[early_stop])
booster = cls.get_booster()
dump = booster.get_dump(dump_format='json')
assert len(dump) == booster.best_iteration
assert len(dump) == booster.best_iteration + 1
early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
save_best=True)

View File

@@ -24,6 +24,10 @@ def run_booster_check(booster, name):
config['learner']['learner_model_param']['base_score']) == 0.5
assert config['learner']['learner_train_param'][
'objective'] == 'multi:softmax'
elif name.find('logitraw') != -1:
assert len(booster.get_dump()) == gm.kForests * gm.kRounds
assert config['learner']['learner_model_param']['num_class'] == str(0)
assert config['learner']['learner_train_param']['objective'] == 'binary:logitraw'
elif name.find('logit') != -1:
assert len(booster.get_dump()) == gm.kForests * gm.kRounds
assert config['learner']['learner_model_param']['num_class'] == str(0)
@@ -77,6 +81,13 @@ def run_scikit_model_check(name, path):
assert config['learner']['learner_train_param'][
'objective'] == 'rank:ndcg'
run_model_param_check(config)
elif name.find('logitraw') != -1:
logit = xgboost.XGBClassifier()
logit.load_model(path)
assert (len(logit.get_booster().get_dump()) ==
gm.kRounds * gm.kForests)
config = json.loads(logit.get_booster().save_config())
assert config['learner']['learner_train_param']['objective'] == 'binary:logitraw'
elif name.find('logit') != -1:
logit = xgboost.XGBClassifier()
logit.load_model(path)

View File

@@ -123,13 +123,13 @@ class TestTrainingContinuation:
gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
num_boost_round=7)
assert gbdt_05.best_ntree_limit == (
gbdt_05.best_iteration + 1) * self.num_parallel_tree
gbdt_05.best_iteration + 1) * self.num_parallel_tree * 5
gbdt_05 = xgb.train(xgb_params_03,
dtrain_5class,
num_boost_round=3,
xgb_model=gbdt_05)
assert gbdt_05.best_ntree_limit == (
gbdt_05.best_iteration + 1) * self.num_parallel_tree
gbdt_05.best_iteration + 1) * self.num_parallel_tree * 5
res1 = gbdt_05.predict(dtrain_5class)
res2 = gbdt_05.predict(dtrain_5class,

View File

@@ -8,7 +8,8 @@ import asyncio
from sklearn.datasets import make_classification
import os
import subprocess
from hypothesis import given, settings, note
import hypothesis
from hypothesis import given, settings, note, HealthCheck
from test_updaters import hist_parameter_strategy, exact_parameter_strategy
if sys.platform.startswith("win"):
@@ -17,6 +18,12 @@ if tm.no_dask()['condition']:
pytest.skip(msg=tm.no_dask()['reason'], allow_module_level=True)
if hasattr(HealthCheck, 'function_scoped_fixture'):
suppress = [HealthCheck.function_scoped_fixture]
else:
suppress = hypothesis.utils.conventions.not_set
try:
from distributed import LocalCluster, Client, get_client
from distributed.utils_test import client, loop, cluster_fixture
@@ -668,14 +675,14 @@ class TestWithDask:
@given(params=hist_parameter_strategy,
dataset=tm.dataset_strategy)
@settings(deadline=None)
@settings(deadline=None, suppress_health_check=suppress)
def test_hist(self, params, dataset, client):
num_rounds = 30
self.run_updater_test(client, params, num_rounds, dataset, 'hist')
@given(params=exact_parameter_strategy,
dataset=tm.dataset_strategy)
@settings(deadline=None)
@settings(deadline=None, suppress_health_check=suppress)
def test_approx(self, client, params, dataset):
num_rounds = 30
self.run_updater_test(client, params, num_rounds, dataset, 'approx')
@@ -795,7 +802,6 @@ class TestDaskCallbacks:
merged = xgb.dask._get_workers_from_data(train, evals=[(valid, 'Valid')])
assert len(merged) == 2
def test_data_initialization(self):
'''Assert each worker has the correct amount of data, and DMatrix initialization doesn't
generate unnecessary copies of data.

View File

@@ -78,6 +78,34 @@ def test_multiclass_classification():
check_pred(preds4, labels, output_margin=False)
def test_best_ntree_limit():
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
def train(booster, forest):
rounds = 4
cls = xgb.XGBClassifier(
n_estimators=rounds, num_parallel_tree=forest, booster=booster
).fit(
X, y, eval_set=[(X, y)], early_stopping_rounds=3
)
if forest:
assert cls.best_ntree_limit == rounds * forest * cls.n_classes_
else:
assert cls.best_ntree_limit == 0
# best_ntree_limit is used by default, assert that under gblinear it's
# automatically ignored due to being 0.
cls.predict(X)
num_parallel_tree = 4
train('gbtree', num_parallel_tree)
train('dart', num_parallel_tree)
train('gblinear', None)
def test_ranking():
# generate random data
x_train = np.random.rand(1000, 10)
@@ -94,6 +122,8 @@ def test_ranking():
model = xgb.sklearn.XGBRanker(**params)
model.fit(x_train, y_train, group=train_group,
eval_set=[(x_valid, y_valid)], eval_group=[valid_group])
assert model.evals_result()
pred = model.predict(x_test)
train_data = xgb.DMatrix(x_train, y_train)