Compare commits
14 Commits
master-roc
...
release_0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6852d0afd5 | ||
|
|
c0bd296354 | ||
|
|
09142c94f5 | ||
|
|
ba4244ef51 | ||
|
|
a46b0ac2d2 | ||
|
|
4bc7e94603 | ||
|
|
a899e8f4cd | ||
|
|
f9a833f525 | ||
|
|
1afd2f1b2d | ||
|
|
b1d76d533d | ||
|
|
9d70655c42 | ||
|
|
dd1fda449c | ||
|
|
324f3b5259 | ||
|
|
24e08c2638 |
103
Jenkinsfile
vendored
103
Jenkinsfile
vendored
@ -3,8 +3,15 @@
|
||||
// Jenkins pipeline
|
||||
// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
|
||||
|
||||
import groovy.transform.Field
|
||||
|
||||
/* Unrestricted tasks: tasks that do NOT generate artifacts */
|
||||
|
||||
// Command to run command inside a docker container
|
||||
dockerRun = 'tests/ci_build/ci_build.sh'
|
||||
def dockerRun = 'tests/ci_build/ci_build.sh'
|
||||
// Utility functions
|
||||
@Field
|
||||
def utils
|
||||
|
||||
def buildMatrix = [
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "9.2" ],
|
||||
@ -26,42 +33,25 @@ pipeline {
|
||||
|
||||
// Build stages
|
||||
stages {
|
||||
stage('Get sources') {
|
||||
agent any
|
||||
stage('Jenkins: Get sources') {
|
||||
agent {
|
||||
label 'unrestricted'
|
||||
}
|
||||
steps {
|
||||
checkoutSrcs()
|
||||
script {
|
||||
utils = load('tests/ci_build/jenkins_tools.Groovy')
|
||||
utils.checkoutSrcs()
|
||||
}
|
||||
stash name: 'srcs', excludes: '.git/'
|
||||
milestone label: 'Sources ready', ordinal: 1
|
||||
}
|
||||
}
|
||||
stage('Build doc') {
|
||||
agent any
|
||||
steps {
|
||||
script {
|
||||
if (env.CHANGE_ID == null) { // This is a branch
|
||||
def commit_id = "${GIT_COMMIT}"
|
||||
def branch_name = "${GIT_LOCAL_BRANCH}"
|
||||
echo 'Building doc...'
|
||||
dir ('jvm-packages') {
|
||||
sh "bash ./build_doc.sh ${commit_id}"
|
||||
archiveArtifacts artifacts: "${commit_id}.tar.bz2", allowEmptyArchive: true
|
||||
echo 'Deploying doc...'
|
||||
withAWS(credentials:'xgboost-doc-bucket') {
|
||||
s3Upload file: "${commit_id}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${branch_name}.tar.bz2"
|
||||
}
|
||||
}
|
||||
} else { // This is a pull request
|
||||
echo 'Skipping doc build step for pull request'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
stage('Build & Test') {
|
||||
stage('Jenkins: Build & Test') {
|
||||
steps {
|
||||
script {
|
||||
parallel (buildMatrix.findAll{it['enabled']}.collectEntries{ c ->
|
||||
def buildName = getBuildName(c)
|
||||
buildFactory(buildName, c)
|
||||
def buildName = utils.getBuildName(c)
|
||||
utils.buildFactory(buildName, c, false, this.&buildPlatformCmake)
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -69,37 +59,11 @@ pipeline {
|
||||
}
|
||||
}
|
||||
|
||||
// initialize source codes
|
||||
def checkoutSrcs() {
|
||||
retry(5) {
|
||||
try {
|
||||
timeout(time: 2, unit: 'MINUTES') {
|
||||
checkout scm
|
||||
sh 'git submodule update --init'
|
||||
}
|
||||
} catch (exc) {
|
||||
deleteDir()
|
||||
error "Failed to fetch source codes"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates cmake and make builds
|
||||
*/
|
||||
def buildFactory(buildName, conf) {
|
||||
def os = conf["os"]
|
||||
def nodeReq = conf["withGpu"] ? "${os} && gpu" : "${os}"
|
||||
def dockerTarget = conf["withGpu"] ? "gpu" : "cpu"
|
||||
[ ("${buildName}") : { buildPlatformCmake("${buildName}", conf, nodeReq, dockerTarget) }
|
||||
]
|
||||
}
|
||||
|
||||
/**
|
||||
* Build platform and test it via cmake.
|
||||
*/
|
||||
def buildPlatformCmake(buildName, conf, nodeReq, dockerTarget) {
|
||||
def opts = cmakeOptions(conf)
|
||||
def opts = utils.cmakeOptions(conf)
|
||||
// Destination dir for artifacts
|
||||
def distDir = "dist/${buildName}"
|
||||
def dockerArgs = ""
|
||||
@ -119,33 +83,6 @@ def buildPlatformCmake(buildName, conf, nodeReq, dockerTarget) {
|
||||
sh """
|
||||
${dockerRun} ${dockerTarget} ${dockerArgs} tests/ci_build/build_via_cmake.sh ${opts}
|
||||
${dockerRun} ${dockerTarget} ${dockerArgs} tests/ci_build/test_${dockerTarget}.sh
|
||||
${dockerRun} ${dockerTarget} ${dockerArgs} bash -c "cd python-package; rm -f dist/*; python setup.py bdist_wheel --universal"
|
||||
rm -rf "${distDir}"; mkdir -p "${distDir}/py"
|
||||
cp xgboost "${distDir}"
|
||||
cp -r lib "${distDir}"
|
||||
cp -r python-package/dist "${distDir}/py"
|
||||
# Test the wheel for compatibility on a barebones CPU container
|
||||
${dockerRun} release ${dockerArgs} bash -c " \
|
||||
auditwheel show xgboost-*-py2-none-any.whl
|
||||
pip install --user python-package/dist/xgboost-*-none-any.whl && \
|
||||
python -m nose tests/python"
|
||||
"""
|
||||
archiveArtifacts artifacts: "${distDir}/**/*.*", allowEmptyArchive: true
|
||||
}
|
||||
}
|
||||
|
||||
def cmakeOptions(conf) {
|
||||
return ([
|
||||
conf["withGpu"] ? '-DUSE_CUDA=ON' : '-DUSE_CUDA=OFF',
|
||||
conf["withNccl"] ? '-DUSE_NCCL=ON' : '-DUSE_NCCL=OFF',
|
||||
conf["withOmp"] ? '-DOPEN_MP:BOOL=ON' : '']
|
||||
).join(" ")
|
||||
}
|
||||
|
||||
def getBuildName(conf) {
|
||||
def gpuLabel = conf['withGpu'] ? ("_cuda" + conf['cudaVersion'] + (conf['withNccl'] ? "_nccl" : "_nonccl")) : "_cpu"
|
||||
def ompLabel = conf['withOmp'] ? "_omp" : ""
|
||||
def pyLabel = "_py${conf['pythonVersion']}"
|
||||
return "${conf['os']}${gpuLabel}${ompLabel}${pyLabel}"
|
||||
}
|
||||
|
||||
|
||||
121
Jenkinsfile-restricted
Normal file
121
Jenkinsfile-restricted
Normal file
@ -0,0 +1,121 @@
|
||||
#!/usr/bin/groovy
|
||||
// -*- mode: groovy -*-
|
||||
// Jenkins pipeline
|
||||
// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
|
||||
|
||||
import groovy.transform.Field
|
||||
|
||||
/* Restricted tasks: tasks generating artifacts, such as binary wheels and
|
||||
documentation */
|
||||
|
||||
// Command to run command inside a docker container
|
||||
def dockerRun = 'tests/ci_build/ci_build.sh'
|
||||
// Utility functions
|
||||
@Field
|
||||
def utils
|
||||
|
||||
def buildMatrix = [
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "9.2" ],
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ],
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": false, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ],
|
||||
]
|
||||
|
||||
pipeline {
|
||||
// Each stage specify its own agent
|
||||
agent none
|
||||
|
||||
// Setup common job properties
|
||||
options {
|
||||
ansiColor('xterm')
|
||||
timestamps()
|
||||
timeout(time: 120, unit: 'MINUTES')
|
||||
buildDiscarder(logRotator(numToKeepStr: '10'))
|
||||
}
|
||||
|
||||
// Build stages
|
||||
stages {
|
||||
stage('Jenkins: Get sources') {
|
||||
agent {
|
||||
label 'restricted'
|
||||
}
|
||||
steps {
|
||||
script {
|
||||
utils = load('tests/ci_build/jenkins_tools.Groovy')
|
||||
utils.checkoutSrcs()
|
||||
}
|
||||
stash name: 'srcs', excludes: '.git/'
|
||||
milestone label: 'Sources ready', ordinal: 1
|
||||
}
|
||||
}
|
||||
stage('Jenkins: Build doc') {
|
||||
agent {
|
||||
label 'linux && cpu && restricted'
|
||||
}
|
||||
steps {
|
||||
unstash name: 'srcs'
|
||||
script {
|
||||
def commit_id = "${GIT_COMMIT}"
|
||||
def branch_name = "${GIT_LOCAL_BRANCH}"
|
||||
echo 'Building doc...'
|
||||
dir ('jvm-packages') {
|
||||
sh "bash ./build_doc.sh ${commit_id}"
|
||||
archiveArtifacts artifacts: "${commit_id}.tar.bz2", allowEmptyArchive: true
|
||||
echo 'Deploying doc...'
|
||||
withAWS(credentials:'xgboost-doc-bucket') {
|
||||
s3Upload file: "${commit_id}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${branch_name}.tar.bz2"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stage('Jenkins: Build artifacts') {
|
||||
steps {
|
||||
script {
|
||||
parallel (buildMatrix.findAll{it['enabled']}.collectEntries{ c ->
|
||||
def buildName = utils.getBuildName(c)
|
||||
utils.buildFactory(buildName, c, true, this.&buildPlatformCmake)
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build platform and test it via cmake.
|
||||
*/
|
||||
def buildPlatformCmake(buildName, conf, nodeReq, dockerTarget) {
|
||||
def opts = utils.cmakeOptions(conf)
|
||||
// Destination dir for artifacts
|
||||
def distDir = "dist/${buildName}"
|
||||
def dockerArgs = ""
|
||||
if(conf["withGpu"]){
|
||||
dockerArgs = "--build-arg CUDA_VERSION=" + conf["cudaVersion"]
|
||||
}
|
||||
// Build node - this is returned result
|
||||
node(nodeReq) {
|
||||
unstash name: 'srcs'
|
||||
echo """
|
||||
|===== XGBoost CMake build =====
|
||||
| dockerTarget: ${dockerTarget}
|
||||
| cmakeOpts : ${opts}
|
||||
|=========================
|
||||
""".stripMargin('|')
|
||||
// Invoke command inside docker
|
||||
sh """
|
||||
${dockerRun} ${dockerTarget} ${dockerArgs} tests/ci_build/build_via_cmake.sh ${opts}
|
||||
${dockerRun} ${dockerTarget} ${dockerArgs} bash -c "cd python-package; rm -f dist/*; python setup.py bdist_wheel --universal"
|
||||
rm -rf "${distDir}"; mkdir -p "${distDir}/py"
|
||||
cp xgboost "${distDir}"
|
||||
cp -r lib "${distDir}"
|
||||
cp -r python-package/dist "${distDir}/py"
|
||||
# Test the wheel for compatibility on a barebones CPU container
|
||||
${dockerRun} release ${dockerArgs} bash -c " \
|
||||
auditwheel show xgboost-*-py2-none-any.whl
|
||||
pip install --user python-package/dist/xgboost-*-none-any.whl && \
|
||||
python -m nose tests/python"
|
||||
"""
|
||||
archiveArtifacts artifacts: "${distDir}/**/*.*", allowEmptyArchive: true
|
||||
}
|
||||
}
|
||||
@ -168,7 +168,7 @@ cb.evaluation.log <- function() {
|
||||
#' at the beginning of each iteration.
|
||||
#'
|
||||
#' Note that when training is resumed from some previous model, and a function is used to
|
||||
#' reset a parameter value, the \code{nround} argument in this function would be the
|
||||
#' reset a parameter value, the \code{nrounds} argument in this function would be the
|
||||
#' the number of boosting rounds in the current training.
|
||||
#'
|
||||
#' Callback function expects the following values to be set in its calling frame:
|
||||
|
||||
@ -52,9 +52,9 @@
|
||||
#' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
|
||||
#'
|
||||
#' param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
|
||||
#' nround = 4
|
||||
#' nrounds = 4
|
||||
#'
|
||||
#' bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
|
||||
#' bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
|
||||
#'
|
||||
#' # Model accuracy without new features
|
||||
#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) /
|
||||
@ -68,7 +68,7 @@
|
||||
#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
|
||||
#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
|
||||
#' watchlist <- list(train = new.dtrain)
|
||||
#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
|
||||
#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)
|
||||
#'
|
||||
#' # Model accuracy with new features
|
||||
#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) /
|
||||
|
||||
@ -22,7 +22,7 @@
|
||||
#' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
|
||||
#' \item \code{max_depth} maximum depth of a tree. Default: 6
|
||||
#' \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
|
||||
#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
|
||||
#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1
|
||||
#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
|
||||
#' \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
|
||||
#' \item \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length equals to the number of features in the training data. \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint.
|
||||
|
||||
@ -5,20 +5,20 @@ data(agaricus.test, package='xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
nround <- 2
|
||||
nrounds <- 2
|
||||
param <- list(max_depth=2, eta=1, silent=1, nthread=2, objective='binary:logistic')
|
||||
|
||||
cat('running cross validation\n')
|
||||
# do cross validation, this will print result out as
|
||||
# [iteration] metric_name:mean_value+std_value
|
||||
# std_value is standard deviation of the metric
|
||||
xgb.cv(param, dtrain, nround, nfold=5, metrics={'error'})
|
||||
xgb.cv(param, dtrain, nrounds, nfold=5, metrics={'error'})
|
||||
|
||||
cat('running cross validation, disable standard deviation display\n')
|
||||
# do cross validation, this will print result out as
|
||||
# [iteration] metric_name:mean_value+std_value
|
||||
# std_value is standard deviation of the metric
|
||||
xgb.cv(param, dtrain, nround, nfold=5,
|
||||
xgb.cv(param, dtrain, nrounds, nfold=5,
|
||||
metrics='error', showsd = FALSE)
|
||||
|
||||
###
|
||||
@ -43,9 +43,9 @@ evalerror <- function(preds, dtrain) {
|
||||
param <- list(max_depth=2, eta=1, silent=1,
|
||||
objective = logregobj, eval_metric = evalerror)
|
||||
# train with customized objective
|
||||
xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5)
|
||||
xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5)
|
||||
|
||||
# do cross validation with prediction values for each fold
|
||||
res <- xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5, prediction = TRUE)
|
||||
res <- xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5, prediction = TRUE)
|
||||
res$evaluation_log
|
||||
length(res$pred)
|
||||
|
||||
@ -7,10 +7,10 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
|
||||
watchlist <- list(eval = dtest, train = dtrain)
|
||||
nround = 2
|
||||
nrounds = 2
|
||||
|
||||
# training the model for two rounds
|
||||
bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
|
||||
bst = xgb.train(param, dtrain, nrounds, nthread = 2, watchlist)
|
||||
cat('start testing prediction from first n trees\n')
|
||||
labels <- getinfo(dtest,'label')
|
||||
|
||||
|
||||
@ -11,10 +11,10 @@ dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
|
||||
nround = 4
|
||||
nrounds = 4
|
||||
|
||||
# training the model for two rounds
|
||||
bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
|
||||
bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
|
||||
|
||||
# Model accuracy without new features
|
||||
accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
|
||||
@ -43,7 +43,7 @@ new.features.test <- create.new.tree.features(bst, agaricus.test$data)
|
||||
new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
|
||||
new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
|
||||
watchlist <- list(train = new.dtrain)
|
||||
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
|
||||
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)
|
||||
|
||||
# Model accuracy with new features
|
||||
accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
|
||||
|
||||
@ -22,7 +22,7 @@ This is a "pre-iteration" callback function used to reset booster's parameters
|
||||
at the beginning of each iteration.
|
||||
|
||||
Note that when training is resumed from some previous model, and a function is used to
|
||||
reset a parameter value, the \code{nround} argument in this function would be the
|
||||
reset a parameter value, the \code{nrounds} argument in this function would be the
|
||||
the number of boosting rounds in the current training.
|
||||
|
||||
Callback function expects the following values to be set in its calling frame:
|
||||
|
||||
@ -63,9 +63,9 @@ dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
|
||||
nround = 4
|
||||
nrounds = 4
|
||||
|
||||
bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
|
||||
bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
|
||||
|
||||
# Model accuracy without new features
|
||||
accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) /
|
||||
@ -79,7 +79,7 @@ new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
|
||||
new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
|
||||
new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
|
||||
watchlist <- list(train = new.dtrain)
|
||||
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
|
||||
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)
|
||||
|
||||
# Model accuracy with new features
|
||||
accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) /
|
||||
|
||||
@ -35,7 +35,7 @@ xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
|
||||
\item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
|
||||
\item \code{max_depth} maximum depth of a tree. Default: 6
|
||||
\item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
|
||||
\item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
|
||||
\item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1
|
||||
\item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
|
||||
\item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
|
||||
\item \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length equals to the number of features in the training data. \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint.
|
||||
|
||||
@ -12,7 +12,7 @@ XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
|
||||
|
||||
# disable the use of thread_local for 32 bit windows:
|
||||
ifeq ($(R_OSTYPE)$(WIN),windows)
|
||||
XGB_RFLAGS += -DDMLC_CXX11_THREAD_LOCAL=0 -msse2 -mfpmath=sse
|
||||
XGB_RFLAGS += -DDMLC_CXX11_THREAD_LOCAL=0
|
||||
endif
|
||||
$(foreach v, $(XGB_RFLAGS), $(warning $(v)))
|
||||
|
||||
|
||||
@ -24,7 +24,7 @@ XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
|
||||
|
||||
# disable the use of thread_local for 32 bit windows:
|
||||
ifeq ($(R_OSTYPE)$(WIN),windows)
|
||||
XGB_RFLAGS += -DDMLC_CXX11_THREAD_LOCAL=0 -msse2 -mfpmath=sse
|
||||
XGB_RFLAGS += -DDMLC_CXX11_THREAD_LOCAL=0
|
||||
endif
|
||||
$(foreach v, $(XGB_RFLAGS), $(warning $(v)))
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ test_that("train and prediction when gctorture is on", {
|
||||
test <- agaricus.test
|
||||
gctorture(TRUE)
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
|
||||
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
pred <- predict(bst, test$data)
|
||||
gctorture(FALSE)
|
||||
})
|
||||
|
||||
@ -7,6 +7,9 @@ require(vcd, quietly = TRUE)
|
||||
|
||||
float_tolerance = 5e-6
|
||||
|
||||
# disable some tests for Win32
|
||||
win32_flag = .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8
|
||||
|
||||
set.seed(1982)
|
||||
data(Arthritis)
|
||||
df <- data.table(Arthritis, keep.rownames = F)
|
||||
@ -41,7 +44,8 @@ mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
|
||||
|
||||
|
||||
test_that("xgb.dump works", {
|
||||
expect_length(xgb.dump(bst.Tree), 200)
|
||||
if (!win32_flag)
|
||||
expect_length(xgb.dump(bst.Tree), 200)
|
||||
dump_file = file.path(tempdir(), 'xgb.model.dump')
|
||||
expect_true(xgb.dump(bst.Tree, dump_file, with_stats = T))
|
||||
expect_true(file.exists(dump_file))
|
||||
@ -50,7 +54,8 @@ test_that("xgb.dump works", {
|
||||
# JSON format
|
||||
dmp <- xgb.dump(bst.Tree, dump_format = "json")
|
||||
expect_length(dmp, 1)
|
||||
expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 188)
|
||||
if (!win32_flag)
|
||||
expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 188)
|
||||
})
|
||||
|
||||
test_that("xgb.dump works for gblinear", {
|
||||
@ -210,7 +215,8 @@ test_that("xgb.model.dt.tree works with and without feature names", {
|
||||
names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
|
||||
dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
|
||||
expect_equal(names.dt.trees, names(dt.tree))
|
||||
expect_equal(dim(dt.tree), c(188, 10))
|
||||
if (!win32_flag)
|
||||
expect_equal(dim(dt.tree), c(188, 10))
|
||||
expect_output(str(dt.tree), 'Feature.*\\"Age\\"')
|
||||
|
||||
dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree)
|
||||
@ -236,7 +242,8 @@ test_that("xgb.model.dt.tree throws error for gblinear", {
|
||||
|
||||
test_that("xgb.importance works with and without feature names", {
|
||||
importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree)
|
||||
expect_equal(dim(importance.Tree), c(7, 4))
|
||||
if (!win32_flag)
|
||||
expect_equal(dim(importance.Tree), c(7, 4))
|
||||
expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
|
||||
expect_output(str(importance.Tree), 'Feature.*\\"Age\\"')
|
||||
|
||||
|
||||
@ -7,6 +7,10 @@ data(agaricus.test, package = 'xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
# Disable flaky tests for 32-bit Windows.
|
||||
# See https://github.com/dmlc/xgboost/issues/3720
|
||||
win32_flag = .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8
|
||||
|
||||
test_that("updating the model works", {
|
||||
watchlist = list(train = dtrain, test = dtest)
|
||||
|
||||
@ -29,7 +33,9 @@ test_that("updating the model works", {
|
||||
tr1r <- xgb.model.dt.tree(model = bst1r)
|
||||
# all should be the same when no subsampling
|
||||
expect_equal(bst1$evaluation_log, bst1r$evaluation_log)
|
||||
expect_equal(tr1, tr1r, tolerance = 0.00001, check.attributes = FALSE)
|
||||
if (!win32_flag) {
|
||||
expect_equal(tr1, tr1r, tolerance = 0.00001, check.attributes = FALSE)
|
||||
}
|
||||
|
||||
# the same boosting with subsampling with an extra 'refresh' updater:
|
||||
p2r <- modifyList(p2, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE))
|
||||
@ -38,7 +44,9 @@ test_that("updating the model works", {
|
||||
tr2r <- xgb.model.dt.tree(model = bst2r)
|
||||
# should be the same evaluation but different gains and larger cover
|
||||
expect_equal(bst2$evaluation_log, bst2r$evaluation_log)
|
||||
expect_equal(tr2[Feature == 'Leaf']$Quality, tr2r[Feature == 'Leaf']$Quality)
|
||||
if (!win32_flag) {
|
||||
expect_equal(tr2[Feature == 'Leaf']$Quality, tr2r[Feature == 'Leaf']$Quality)
|
||||
}
|
||||
expect_gt(sum(abs(tr2[Feature != 'Leaf']$Quality - tr2r[Feature != 'Leaf']$Quality)), 100)
|
||||
expect_gt(sum(tr2r$Cover) / sum(tr2$Cover), 1.5)
|
||||
|
||||
@ -61,7 +69,9 @@ test_that("updating the model works", {
|
||||
expect_gt(sum(tr2u$Cover) / sum(tr2$Cover), 1.5)
|
||||
# the results should be the same as for the model with an extra 'refresh' updater
|
||||
expect_equal(bst2r$evaluation_log, bst2u$evaluation_log)
|
||||
expect_equal(tr2r, tr2u, tolerance = 0.00001, check.attributes = FALSE)
|
||||
if (!win32_flag) {
|
||||
expect_equal(tr2r, tr2u, tolerance = 0.00001, check.attributes = FALSE)
|
||||
}
|
||||
|
||||
# process type 'update' for no-subsampling model, refreshing only the tree stats from TEST data:
|
||||
p1ut <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
|
||||
|
||||
@ -80,12 +80,6 @@ booster = gblinear
|
||||
# L2 regularization term on weights, default 0
|
||||
lambda = 0.01
|
||||
# L1 regularization term on weights, default 0
|
||||
If ```agaricus.txt.test.buffer``` exists, and automatically loads from binary buffer if possible, this can speedup training process when you do training many times. You can disable it by setting ```use_buffer=0```.
|
||||
- Buffer file can also be used as standalone input, i.e if buffer file exists, but original agaricus.txt.test was removed, xgboost will still run
|
||||
* Deviation from LibSVM input format: xgboost is compatible with LibSVM format, with the following minor differences:
|
||||
- xgboost allows feature index starts from 0
|
||||
- for binary classification, the label is 1 for positive, 0 for negative, instead of +1,-1
|
||||
- the feature indices in each line *do not* need to be sorted
|
||||
alpha = 0.01
|
||||
# L2 regularization term on bias, default 0
|
||||
lambda_bias = 0.01
|
||||
@ -102,7 +96,7 @@ After training, we can use the output model to get the prediction of the test da
|
||||
For binary classification, the output predictions are probability confidence scores in [0,1], corresponds to the probability of the label to be positive.
|
||||
|
||||
#### Dump Model
|
||||
This is a preliminary feature, so far only tree model support text dump. XGBoost can display the tree models in text files and we can scan the model in an easy way:
|
||||
This is a preliminary feature, so only tree models support text dump. XGBoost can display the tree models in text or JSON files, and we can scan the model in an easy way:
|
||||
```
|
||||
../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
||||
../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
|
||||
|
||||
@ -33,10 +33,10 @@ def logregobj(preds, dtrain):
|
||||
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
|
||||
def evalerror(preds, dtrain):
|
||||
labels = dtrain.get_label()
|
||||
# return a pair metric_name, result
|
||||
# return a pair metric_name, result. The metric name must not contain a colon (:)
|
||||
# since preds are margin(before logistic transformation, cutoff at 0)
|
||||
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
|
||||
|
||||
# training with customized objective, we can also do step by step training
|
||||
# simply look at xgboost.py's implementation of train
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror)
|
||||
|
||||
@ -24,9 +24,9 @@ param <- list("objective" = "binary:logitraw",
|
||||
"silent" = 1,
|
||||
"nthread" = 16)
|
||||
watchlist <- list("train" = xgmat)
|
||||
nround = 120
|
||||
nrounds = 120
|
||||
print ("loading data end, start to boost trees")
|
||||
bst = xgb.train(param, xgmat, nround, watchlist );
|
||||
bst = xgb.train(param, xgmat, nrounds, watchlist );
|
||||
# save out model
|
||||
xgb.save(bst, "higgs.model")
|
||||
print ('finish training')
|
||||
|
||||
@ -39,9 +39,9 @@ for (i in 1:length(threads)){
|
||||
"silent" = 1,
|
||||
"nthread" = thread)
|
||||
watchlist <- list("train" = xgmat)
|
||||
nround = 120
|
||||
nrounds = 120
|
||||
print ("loading data end, start to boost trees")
|
||||
bst = xgb.train(param, xgmat, nround, watchlist );
|
||||
bst = xgb.train(param, xgmat, nrounds, watchlist );
|
||||
# save out model
|
||||
xgb.save(bst, "higgs.model")
|
||||
print ('finish training')
|
||||
|
||||
@ -23,13 +23,13 @@ param <- list("objective" = "multi:softprob",
|
||||
"nthread" = 8)
|
||||
|
||||
# Run Cross Validation
|
||||
cv.nround = 50
|
||||
cv.nrounds = 50
|
||||
bst.cv = xgb.cv(param=param, data = x[trind,], label = y,
|
||||
nfold = 3, nrounds=cv.nround)
|
||||
nfold = 3, nrounds=cv.nrounds)
|
||||
|
||||
# Train the model
|
||||
nround = 50
|
||||
bst = xgboost(param=param, data = x[trind,], label = y, nrounds=nround)
|
||||
nrounds = 50
|
||||
bst = xgboost(param=param, data = x[trind,], label = y, nrounds=nrounds)
|
||||
|
||||
# Make prediction
|
||||
pred = predict(bst,x[teind,])
|
||||
|
||||
@ -121,19 +121,19 @@ param <- list("objective" = "multi:softprob",
|
||||
"eval_metric" = "mlogloss",
|
||||
"num_class" = numberOfClasses)
|
||||
|
||||
cv.nround <- 5
|
||||
cv.nrounds <- 5
|
||||
cv.nfold <- 3
|
||||
|
||||
bst.cv = xgb.cv(param=param, data = trainMatrix, label = y,
|
||||
nfold = cv.nfold, nrounds = cv.nround)
|
||||
nfold = cv.nfold, nrounds = cv.nrounds)
|
||||
```
|
||||
> As we can see the error rate is low on the test dataset (for a 5mn trained model).
|
||||
|
||||
Finally, we are ready to train the real model!!!
|
||||
|
||||
```{r modelTraining}
|
||||
nround = 50
|
||||
bst = xgboost(param=param, data = trainMatrix, label = y, nrounds=nround)
|
||||
nrounds = 50
|
||||
bst = xgboost(param=param, data = trainMatrix, label = y, nrounds=nrounds)
|
||||
```
|
||||
|
||||
Model understanding
|
||||
@ -142,7 +142,7 @@ Model understanding
|
||||
Feature importance
|
||||
------------------
|
||||
|
||||
So far, we have built a model made of **`r nround`** trees.
|
||||
So far, we have built a model made of **`r nrounds`** trees.
|
||||
|
||||
To build a tree, the dataset is divided recursively several times. At the end of the process, you get groups of observations (here, these observations are properties regarding **Otto** products).
|
||||
|
||||
|
||||
@ -222,7 +222,7 @@ The code below is very usual. For more information, you can look at the document
|
||||
|
||||
```r
|
||||
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 4,
|
||||
eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
|
||||
eta = 1, nthread = 2, nrounds = 10,objective = "binary:logistic")
|
||||
```
|
||||
|
||||
```
|
||||
@ -244,7 +244,7 @@ A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitti
|
||||
|
||||
> Here you can see the numbers decrease until line 7 and then increase.
|
||||
>
|
||||
> It probably means we are overfitting. To fix that I should reduce the number of rounds to `nround = 4`. I will let things like that because I don't really care for the purpose of this example :-)
|
||||
> It probably means we are overfitting. To fix that I should reduce the number of rounds to `nrounds = 4`. I will let things like that because I don't really care for the purpose of this example :-)
|
||||
|
||||
Feature importance
|
||||
------------------
|
||||
@ -448,7 +448,7 @@ train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
|
||||
#Random Forest™ - 1000 trees
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 4, num_parallel_tree = 1000, subsample = 0.5, colsample_bytree =0.5, nround = 1, objective = "binary:logistic")
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 4, num_parallel_tree = 1000, subsample = 0.5, colsample_bytree =0.5, nrounds = 1, objective = "binary:logistic")
|
||||
```
|
||||
|
||||
```
|
||||
@ -457,7 +457,7 @@ bst <- xgboost(data = train$data, label = train$label, max.depth = 4, num_parall
|
||||
|
||||
```r
|
||||
#Boosting - 3 rounds
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 4, nround = 3, objective = "binary:logistic")
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 4, nrounds = 3, objective = "binary:logistic")
|
||||
```
|
||||
|
||||
```
|
||||
|
||||
@ -178,11 +178,11 @@ We will train decision tree model using the following parameters:
|
||||
* `objective = "binary:logistic"`: we will train a binary classification model ;
|
||||
* `max.deph = 2`: the trees won't be deep, because our case is very simple ;
|
||||
* `nthread = 2`: the number of cpu threads we are going to use;
|
||||
* `nround = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction.
|
||||
* `nrounds = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction.
|
||||
|
||||
|
||||
```r
|
||||
bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
|
||||
bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
```
|
||||
|
||||
```
|
||||
@ -200,7 +200,7 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R**
|
||||
|
||||
|
||||
```r
|
||||
bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
|
||||
bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
```
|
||||
|
||||
```
|
||||
@ -215,7 +215,7 @@ bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth
|
||||
|
||||
```r
|
||||
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
|
||||
bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
|
||||
bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
```
|
||||
|
||||
```
|
||||
@ -232,13 +232,13 @@ One of the simplest way to see the training progress is to set the `verbose` opt
|
||||
|
||||
```r
|
||||
# verbose = 0, no message
|
||||
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 0)
|
||||
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
|
||||
```
|
||||
|
||||
|
||||
```r
|
||||
# verbose = 1, print evaluation metric
|
||||
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 1)
|
||||
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 1)
|
||||
```
|
||||
|
||||
```
|
||||
@ -249,7 +249,7 @@ bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, o
|
||||
|
||||
```r
|
||||
# verbose = 2, also print information about tree
|
||||
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2)
|
||||
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 2)
|
||||
```
|
||||
|
||||
```
|
||||
@ -372,7 +372,7 @@ For the purpose of this example, we use `watchlist` parameter. It is a list of `
|
||||
```r
|
||||
watchlist <- list(train=dtrain, test=dtest)
|
||||
|
||||
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
|
||||
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, objective = "binary:logistic")
|
||||
```
|
||||
|
||||
```
|
||||
@ -380,7 +380,7 @@ bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchli
|
||||
## [1] train-error:0.022263 test-error:0.021726
|
||||
```
|
||||
|
||||
**XGBoost** has computed at each round the same average error metric than seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
|
||||
**XGBoost** has computed at each round the same average error metric than seen above (we set `nrounds` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
|
||||
|
||||
Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
|
||||
|
||||
@ -390,7 +390,7 @@ For a better understanding of the learning progression, you may want to have som
|
||||
|
||||
|
||||
```r
|
||||
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
|
||||
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
|
||||
```
|
||||
|
||||
```
|
||||
@ -407,7 +407,7 @@ Until now, all the learnings we have performed were based on boosting trees. **X
|
||||
|
||||
|
||||
```r
|
||||
bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
|
||||
bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nrounds=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
|
||||
```
|
||||
|
||||
```
|
||||
@ -445,7 +445,7 @@ dtrain2 <- xgb.DMatrix("dtrain.buffer")
|
||||
```
|
||||
|
||||
```r
|
||||
bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
|
||||
bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, objective = "binary:logistic")
|
||||
```
|
||||
|
||||
```
|
||||
|
||||
10
doc/conf.py
10
doc/conf.py
@ -14,6 +14,7 @@
|
||||
from subprocess import call
|
||||
from sh.contrib import git
|
||||
import urllib.request
|
||||
from urllib.error import HTTPError
|
||||
from recommonmark.parser import CommonMarkParser
|
||||
import sys
|
||||
import re
|
||||
@ -24,8 +25,11 @@ import guzzle_sphinx_theme
|
||||
git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')]
|
||||
git_branch = [x for x in git_branch if 'HEAD' not in x]
|
||||
print('git_branch = {}'.format(git_branch[0]))
|
||||
filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
|
||||
call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
|
||||
try:
|
||||
filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
|
||||
call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
|
||||
except HTTPError:
|
||||
print('JVM doc not found. Skipping...')
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
@ -146,7 +150,7 @@ extensions.append("guzzle_sphinx_theme")
|
||||
# Guzzle theme options (see theme.conf for more information)
|
||||
html_theme_options = {
|
||||
# Set the name of the project to appear in the sidebar
|
||||
"project_nav_name": "XGBoost"
|
||||
"project_nav_name": "XGBoost (0.80)"
|
||||
}
|
||||
|
||||
html_sidebars = {
|
||||
|
||||
@ -42,7 +42,7 @@ R
|
||||
train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
# fit model
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nrounds = 2,
|
||||
nthread = 2, objective = "binary:logistic")
|
||||
# predict
|
||||
pred <- predict(bst, test$data)
|
||||
|
||||
@ -5,6 +5,12 @@ XGBoost GPU Support
|
||||
This page contains information about GPU algorithms supported in XGBoost.
|
||||
To install GPU support, checkout the :doc:`/build`.
|
||||
|
||||
.. note:: CUDA 8.0, Compute Capability 3.5 required
|
||||
|
||||
The GPU algorithms in XGBoost require a graphics card with compute capability 3.5 or higher, with
|
||||
CUDA toolkits 8.0 or later.
|
||||
(See `this list <https://en.wikipedia.org/wiki/CUDA#GPUs_supported>`_ to look up compute capability of your GPU card.)
|
||||
|
||||
*********************************************
|
||||
CUDA Accelerated Tree Construction Algorithms
|
||||
*********************************************
|
||||
|
||||
@ -274,7 +274,7 @@ and then loading the model in another session:
|
||||
With regards to ML pipeline save and load, please refer the next section.
|
||||
|
||||
Interact with Other Bindings of XGBoost
|
||||
------------------------------------
|
||||
---------------------------------------
|
||||
After we train a model with XGBoost4j-Spark on massive dataset, sometimes we want to do model serving in single machine or integrate it with other single node libraries for further processing. XGBoost4j-Spark supports export model to local by:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
@ -119,7 +119,7 @@ Parameters for Tree Booster
|
||||
|
||||
* ``scale_pos_weight`` [default=1]
|
||||
|
||||
- Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: ``sum(negative instances) / sum(positive instances)``. See `Parameters Tuning </tutorials/param_tuning>`_ for more discussion. Also, see Higgs Kaggle competition demo for examples: `R <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R>`_, `py1 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py>`_, `py2 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py>`_, `py3 <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py>`_.
|
||||
- Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: ``sum(negative instances) / sum(positive instances)``. See :doc:`Parameters Tuning </tutorials/param_tuning>` for more discussion. Also, see Higgs Kaggle competition demo for examples: `R <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R>`_, `py1 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py>`_, `py2 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py>`_, `py3 <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py>`_.
|
||||
|
||||
* ``updater`` [default= ``grow_colmaker,prune``]
|
||||
|
||||
@ -318,10 +318,6 @@ Command Line Parameters
|
||||
***********************
|
||||
The following parameters are only used in the console version of XGBoost
|
||||
|
||||
* ``use_buffer`` [default=1]
|
||||
|
||||
- Whether to create a binary buffer from text input. Doing so normally will speed up loading times
|
||||
|
||||
* ``num_round``
|
||||
|
||||
- The number of rounds for boosting
|
||||
@ -361,6 +357,10 @@ The following parameters are only used in the console version of XGBoost
|
||||
|
||||
- Feature map, used for dumping model
|
||||
|
||||
* ``dump_format`` [default= ``text``] options: ``text``, ``json``
|
||||
|
||||
- Format of model dump file
|
||||
|
||||
* ``name_dump`` [default= ``dump.txt``]
|
||||
|
||||
- Name of model dump file
|
||||
|
||||
@ -2,6 +2,10 @@ Python API Reference
|
||||
====================
|
||||
This page gives the Python API reference of xgboost, please also refer to Python Package Introduction for more information about python package.
|
||||
|
||||
.. contents::
|
||||
:backlinks: none
|
||||
:local:
|
||||
|
||||
Core Data Structure
|
||||
-------------------
|
||||
.. automodule:: xgboost.core
|
||||
@ -29,9 +33,11 @@ Scikit-Learn API
|
||||
.. automodule:: xgboost.sklearn
|
||||
.. autoclass:: xgboost.XGBRegressor
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
.. autoclass:: xgboost.XGBClassifier
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
|
||||
Plotting API
|
||||
|
||||
@ -13,6 +13,10 @@ The external memory version takes in the following filename format:
|
||||
The ``filename`` is the normal path to libsvm file you want to load in, and ``cacheprefix`` is a
|
||||
path to a cache file that XGBoost will use for external memory cache.
|
||||
|
||||
.. note:: External memory is not available with GPU algorithms
|
||||
|
||||
External memory is not available when ``tree_method`` is set to ``gpu_exact`` or ``gpu_hist``.
|
||||
|
||||
The following code was extracted from `demo/guide-python/external_memory.py <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/external_memory.py>`_:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@ -223,7 +223,7 @@ In this equation, :math:`w_j` are independent with respect to each other, the fo
|
||||
w_j^\ast &= -\frac{G_j}{H_j+\lambda}\\
|
||||
\text{obj}^\ast &= -\frac{1}{2} \sum_{j=1}^T \frac{G_j^2}{H_j+\lambda} + \gamma T
|
||||
|
||||
The last equation measures *how good* a tree structure :math:`$q(x)` is.
|
||||
The last equation measures *how good* a tree structure :math:`q(x)` is.
|
||||
|
||||
.. image:: https://raw.githubusercontent.com/dmlc/web-data/master/xgboost/model/struct_score.png
|
||||
:width: 100%
|
||||
|
||||
@ -38,6 +38,8 @@ class TrackerLogger : public BaseLogger {
|
||||
~TrackerLogger();
|
||||
};
|
||||
|
||||
// custom logging callback; disabled for R wrapper
|
||||
#if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0
|
||||
class LogCallbackRegistry {
|
||||
public:
|
||||
using Callback = void (*)(const char*);
|
||||
@ -52,6 +54,17 @@ class LogCallbackRegistry {
|
||||
private:
|
||||
Callback log_callback_;
|
||||
};
|
||||
#else
|
||||
class LogCallbackRegistry {
|
||||
public:
|
||||
using Callback = void (*)(const char*);
|
||||
LogCallbackRegistry() {}
|
||||
inline void Register(Callback log_callback) {}
|
||||
inline Callback Get() const {
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
using LogCallbackRegistryStore = dmlc::ThreadLocalStore<LogCallbackRegistry>;
|
||||
|
||||
|
||||
@ -1358,11 +1358,12 @@ class Booster(object):
|
||||
def get_score(self, fmap='', importance_type='weight'):
|
||||
"""Get feature importance of each feature.
|
||||
Importance type can be defined as:
|
||||
'weight' - the number of times a feature is used to split the data across all trees.
|
||||
'gain' - the average gain across all splits the feature is used in.
|
||||
'cover' - the average coverage across all splits the feature is used in.
|
||||
'total_gain' - the total gain across all splits the feature is used in.
|
||||
'total_cover' - the total coverage across all splits the feature is used in.
|
||||
|
||||
* 'weight': the number of times a feature is used to split the data across all trees.
|
||||
* 'gain': the average gain across all splits the feature is used in.
|
||||
* 'cover': the average coverage across all splits the feature is used in.
|
||||
* 'total_gain': the total gain across all splits the feature is used in.
|
||||
* 'total_cover': the total coverage across all splits the feature is used in.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@ -1478,6 +1479,7 @@ class Booster(object):
|
||||
|
||||
def get_split_value_histogram(self, feature, fmap='', bins=None, as_pandas=True):
|
||||
"""Get split value histogram of a feature
|
||||
|
||||
Parameters
|
||||
----------
|
||||
feature: str
|
||||
@ -1488,7 +1490,7 @@ class Booster(object):
|
||||
The maximum number of bins.
|
||||
Number of bins equals number of unique split values n_unique,
|
||||
if bins == None or bins > n_unique.
|
||||
as_pandas : bool, default True
|
||||
as_pandas: bool, default True
|
||||
Return pd.DataFrame when pandas is installed.
|
||||
If False or pandas is not installed, return numpy ndarray.
|
||||
|
||||
|
||||
@ -28,10 +28,11 @@ def plot_importance(booster, ax=None, height=0.2,
|
||||
grid : bool, Turn the axes grids on or off. Default is True (On).
|
||||
importance_type : str, default "weight"
|
||||
How the importance is calculated: either "weight", "gain", or "cover"
|
||||
"weight" is the number of times a feature appears in a tree
|
||||
"gain" is the average gain of splits which use the feature
|
||||
"cover" is the average coverage of splits which use the feature
|
||||
where coverage is defined as the number of samples affected by the split
|
||||
|
||||
* "weight" is the number of times a feature appears in a tree
|
||||
* "gain" is the average gain of splits which use the feature
|
||||
* "cover" is the average coverage of splits which use the feature
|
||||
where coverage is defined as the number of samples affected by the split
|
||||
max_num_features : int, default None
|
||||
Maximum number of top features displayed on plot. If None, all features will be displayed.
|
||||
height : float, default 0.2
|
||||
|
||||
@ -99,14 +99,16 @@ class XGBModel(XGBModelBase):
|
||||
missing : float, optional
|
||||
Value in the data which needs to be present as a missing value. If
|
||||
None, defaults to np.nan.
|
||||
**kwargs : dict, optional
|
||||
\*\*kwargs : dict, optional
|
||||
Keyword arguments for XGBoost Booster object. Full documentation of parameters can
|
||||
be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md.
|
||||
Attempting to set a parameter via the constructor args and **kwargs dict simultaneously
|
||||
be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
|
||||
Attempting to set a parameter via the constructor args and \*\*kwargs dict simultaneously
|
||||
will result in a TypeError.
|
||||
Note:
|
||||
**kwargs is unsupported by Sklearn. We do not guarantee that parameters passed via
|
||||
this argument will interact properly with Sklearn.
|
||||
|
||||
.. note:: \*\*kwargs unsupported by scikit-learn
|
||||
|
||||
\*\*kwargs is unsupported by scikit-learn. We do not guarantee that parameters
|
||||
passed via this argument will interact properly with scikit-learn.
|
||||
|
||||
Note
|
||||
----
|
||||
@ -217,6 +219,7 @@ class XGBModel(XGBModelBase):
|
||||
def save_model(self, fname):
|
||||
"""
|
||||
Save the model to a file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fname : string
|
||||
@ -227,6 +230,7 @@ class XGBModel(XGBModelBase):
|
||||
def load_model(self, fname):
|
||||
"""
|
||||
Load the model from a file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fname : string or a memory buffer
|
||||
@ -259,7 +263,7 @@ class XGBModel(XGBModelBase):
|
||||
instance weights on the i-th validation set.
|
||||
eval_metric : str, callable, optional
|
||||
If a str, should be a built-in evaluation metric to use. See
|
||||
doc/parameter.md. If callable, a custom evaluation metric. The call
|
||||
doc/parameter.rst. If callable, a custom evaluation metric. The call
|
||||
signature is func(y_predicted, y_true) where y_true will be a
|
||||
DMatrix object such that you may need to call the get_label
|
||||
method. It must return a str, value pair where the str is a name
|
||||
@ -336,6 +340,39 @@ class XGBModel(XGBModelBase):
|
||||
return self
|
||||
|
||||
def predict(self, data, output_margin=False, ntree_limit=None):
|
||||
"""
|
||||
Predict with `data`.
|
||||
|
||||
.. note:: This function is not thread safe.
|
||||
|
||||
For each booster object, predict can only be called from one thread.
|
||||
If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
|
||||
of model object and then call ``predict()``.
|
||||
|
||||
.. note:: Using ``predict()`` with DART booster
|
||||
|
||||
If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
|
||||
some of the trees will be evaluated. This will produce incorrect results if ``data`` is
|
||||
not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
|
||||
a nonzero value, e.g.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
preds = bst.predict(dtest, ntree_limit=num_round)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : DMatrix
|
||||
The dmatrix storing the input.
|
||||
output_margin : bool
|
||||
Whether to output the raw untransformed margin value.
|
||||
ntree_limit : int
|
||||
Limit number of trees in the prediction; defaults to best_ntree_limit if defined
|
||||
(i.e. it has been trained with early stopping), otherwise 0 (use all trees).
|
||||
Returns
|
||||
-------
|
||||
prediction : numpy array
|
||||
"""
|
||||
# pylint: disable=missing-docstring,invalid-name
|
||||
test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
|
||||
# get ntree_limit to use - if none specified, default to
|
||||
@ -372,10 +409,10 @@ class XGBModel(XGBModelBase):
def evals_result(self):
"""Return the evaluation results.

If eval_set is passed to the `fit` function, you can call evals_result() to
get evaluation results for all passed eval_sets. When eval_metric is also
passed to the `fit` function, the evals_result will contain the eval_metrics
passed to the `fit` function
If ``eval_set`` is passed to the `fit` function, you can call ``evals_result()`` to
get evaluation results for all passed eval_sets. When ``eval_metric`` is also
passed to the ``fit`` function, the ``evals_result`` will contain the ``eval_metrics``
passed to the ``fit`` function

Returns
-------
@ -383,20 +420,26 @@ class XGBModel(XGBModelBase):

Example
-------
param_dist = {'objective':'binary:logistic', 'n_estimators':2}

clf = xgb.XGBModel(**param_dist)
.. code-block:: python

clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)
param_dist = {'objective':'binary:logistic', 'n_estimators':2}

evals_result = clf.evals_result()
clf = xgb.XGBModel(**param_dist)

clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)

evals_result = clf.evals_result()

The variable evals_result will contain:
{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}

.. code-block:: none

{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
"""
if self.evals_result_:
evals_result = self.evals_result_
@ -408,9 +451,11 @@ class XGBModel(XGBModelBase):
@property
def feature_importances_(self):
"""
Feature importances property

Returns
-------
feature_importances_ : array of shape = [n_features]
feature_importances_ : array of shape ``[n_features]``

"""
b = self.get_booster()
@ -422,9 +467,8 @@ class XGBModel(XGBModelBase):

class XGBClassifier(XGBModel, XGBClassifierBase):
# pylint: disable=missing-docstring,too-many-arguments,invalid-name
__doc__ = """Implementation of the scikit-learn API for XGBoost classification.

""" + '\n'.join(XGBModel.__doc__.split('\n')[2:])
__doc__ = "Implementation of the scikit-learn API for XGBoost classification.\n\n" \
+ '\n'.join(XGBModel.__doc__.split('\n')[2:])

def __init__(self, max_depth=3, learning_rate=0.1,
n_estimators=100, silent=True,
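The switch above from a triple-quoted prefix to an explicit ``\n\n`` keeps the blank line that separates the summary from the inherited parameter list. A toy sketch of the same docstring-splicing pattern (the class names here are made up):

.. code-block:: python

    class Base(object):
        """Shared estimator docstring.

        Parameters
        ----------
        max_depth : int
        """

    class Derived(Base):
        # Keep everything after the first two lines of Base's docstring,
        # mirroring what the diff does with XGBModel.__doc__.
        __doc__ = "Derived estimator.\n\n" \
            + '\n'.join(Base.__doc__.split('\n')[2:])

    print(Derived.__doc__)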
@ -465,7 +509,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
instance weights on the i-th validation set.
eval_metric : str, callable, optional
If a str, should be a built-in evaluation metric to use. See
doc/parameter.md. If callable, a custom evaluation metric. The call
doc/parameter.rst. If callable, a custom evaluation metric. The call
signature is func(y_predicted, y_true) where y_true will be a
DMatrix object such that you may need to call the get_label
method. It must return a str, value pair where the str is a name
@ -610,10 +654,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
def predict_proba(self, data, ntree_limit=None):
"""
Predict the probability of each `data` example being of a given class.
NOTE: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple thread, call xgb.copy() to make copies
of model object and then call predict

.. note:: This function is not thread safe

For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
of model object and then call predict

Parameters
----------
data : DMatrix
@ -621,6 +668,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
ntree_limit : int
Limit number of trees in the prediction; defaults to best_ntree_limit if defined
(i.e. it has been trained with early stopping), otherwise 0 (use all trees).

Returns
-------
prediction : numpy array
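A short usage sketch for ``predict_proba`` with the sklearn wrapper; the synthetic data below is only for illustration, and the returned array has one column per class:

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    y = (X[:, 0] > 0).astype(int)

    clf = xgb.XGBClassifier(n_estimators=5)
    clf.fit(X, y)

    proba = clf.predict_proba(X)  # shape (100, 2): P(class 0), P(class 1)
    assert np.allclose(proba.sum(axis=1), 1.0)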
@ -652,20 +700,26 @@ class XGBClassifier(XGBModel, XGBClassifierBase):

Example
-------
param_dist = {'objective':'binary:logistic', 'n_estimators':2}

clf = xgb.XGBClassifier(**param_dist)
.. code-block:: python

clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)
param_dist = {'objective':'binary:logistic', 'n_estimators':2}

evals_result = clf.evals_result()
clf = xgb.XGBClassifier(**param_dist)

The variable evals_result will contain:
{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)

evals_result = clf.evals_result()

The variable ``evals_result`` will contain

.. code-block:: none

{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
"""
if self.evals_result_:
evals_result = self.evals_result_
@ -677,5 +731,5 @@ class XGBClassifier(XGBModel, XGBClassifierBase):

class XGBRegressor(XGBModel, XGBRegressorBase):
# pylint: disable=missing-docstring
__doc__ = """Implementation of the scikit-learn API for XGBoost regression.
""" + '\n'.join(XGBModel.__doc__.split('\n')[2:])
__doc__ = "Implementation of the scikit-learn API for XGBoost regression.\n\n"\
+ '\n'.join(XGBModel.__doc__.split('\n')[2:])

@ -147,18 +147,24 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
and/or num_class appears in the parameters)
evals_result: dict
This dictionary stores the evaluation results of all the items in watchlist.

Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')] and
a parameter containing ('eval_metric': 'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953']},
'eval': {'logloss': ['0.480385', '0.357756']}}
a parameter containing ('eval_metric': 'logloss'), the **evals_result**
returns

.. code-block:: none

{'train': {'logloss': ['0.48253', '0.35953']},
'eval': {'logloss': ['0.480385', '0.357756']}}

verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is
If **verbose_eval** is True then the evaluation metric on the validation set is
printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set
is printed at every given `verbose_eval` boosting stage. The last boosting stage
/ the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
If **verbose_eval** is an integer then the evaluation metric on the validation set
is printed at every given **verbose_eval** boosting stage. The last boosting stage
/ the boosting stage found by using **early_stopping_rounds** is also printed.
Example: with ``verbose_eval=4`` and at least one item in evals, an evaluation metric
is printed every 4 boosting stages, instead of every boosting stage.
learning_rates: list or function (deprecated - use callback API instead)
List of learning rate for each boosting round
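Putting the ``evals_result`` and ``verbose_eval`` parameters documented above together, a minimal sketch on synthetic data (illustrative only):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(0)
    X = rng.randn(200, 3)
    y = (X[:, 0] + X[:, 1] > 0).astype(int)

    dtrain = xgb.DMatrix(X[:150], label=y[:150])
    dtest = xgb.DMatrix(X[150:], label=y[150:])

    evals_result = {}
    bst = xgb.train({'objective': 'binary:logistic', 'eval_metric': 'logloss'},
                    dtrain,
                    num_boost_round=20,
                    evals=[(dtest, 'eval'), (dtrain, 'train')],
                    evals_result=evals_result,  # filled in place, keyed by watchlist names
                    verbose_eval=4)             # print the metric every 4 rounds

    print(evals_result['eval']['logloss'])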
@ -328,10 +334,10 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
folds : a KFold or StratifiedKFold instance or list of fold indices
Sklearn KFolds or StratifiedKFolds object.
Alternatively may explicitly pass sample indices for each fold.
For `n` folds, `folds` should be a length `n` list of tuples.
Each tuple is `(in,out)` where `in` is a list of indices to be used
as the training samples for the `n`th fold and `out` is a list of
indices to be used as the testing samples for the `n`th fold.
For ``n`` folds, ``folds`` should be a length ``n`` list of tuples.
Each tuple is ``(in,out)`` where ``in`` is a list of indices to be used
as the training samples for the ``n`` th fold and ``out`` is a list of
indices to be used as the testing samples for the ``n`` th fold.
metrics : string or list of strings
Evaluation metrics to be watched in CV.
obj : function
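The explicit ``(in, out)`` form of ``folds`` described above can be built from scikit-learn's ``KFold``; a minimal sketch on synthetic data:

.. code-block:: python

    import numpy as np
    import xgboost as xgb
    from sklearn.model_selection import KFold

    rng = np.random.RandomState(0)
    X = rng.randn(120, 4)
    y = (X[:, 0] > 0).astype(int)
    dtrain = xgb.DMatrix(X, label=y)

    # One (train_indices, test_indices) tuple per fold, as documented above.
    folds = list(KFold(n_splits=3, shuffle=True, random_state=0).split(X))

    res = xgb.cv({'objective': 'binary:logistic'}, dtrain,
                 num_boost_round=10, folds=folds, metrics='logloss')
    print(res.head())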
@ -363,8 +369,12 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
It is possible to use predefined callbacks by using xgb.callback module.
Example: [xgb.callback.reset_learning_rate(custom_rates)]
shuffle : bool
Example:

.. code-block:: none

[xgb.callback.reset_learning_rate(custom_rates)]
shuffle : bool
Shuffle data before creating folds.

Returns
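And for ``callbacks``, a hedged sketch using the predefined callback named in the docstring (assuming the callback API of this release; ``dtrain`` is assumed to be an existing DMatrix, so the call is left commented out):

.. code-block:: python

    import xgboost as xgb

    # One learning rate per boosting round, decayed geometrically.
    num_round = 10
    custom_rates = [0.3 * (0.9 ** i) for i in range(num_round)]

    # res = xgb.cv({'objective': 'binary:logistic'}, dtrain,
    #              num_boost_round=num_round,
    #              callbacks=[xgb.callback.reset_learning_rate(custom_rates)])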
52
tests/ci_build/jenkins_tools.Groovy
Normal file
@ -0,0 +1,52 @@
#!/usr/bin/groovy
// -*- mode: groovy -*-

/* Utility functions for Jenkins */

// Command to run command inside a docker container
dockerRun = 'tests/ci_build/ci_build.sh'

// initialize source codes
def checkoutSrcs() {
retry(5) {
try {
timeout(time: 2, unit: 'MINUTES') {
checkout scm
sh 'git submodule update --init'
}
} catch (exc) {
deleteDir()
error "Failed to fetch source codes"
}
}
}

/**
 * Creates cmake and make builds
 */
def buildFactory(buildName, conf, restricted, build_func) {
def os = conf["os"]
def device = conf["withGpu"] ? "gpu" : "cpu"
def restricted_flag = restricted ? "restricted" : "unrestricted"
def nodeReq = "${os} && ${device} && ${restricted_flag}"
def dockerTarget = conf["withGpu"] ? "gpu" : "cpu"
[ ("${buildName}") : { build_func("${buildName}", conf, nodeReq, dockerTarget) }
]
}

def cmakeOptions(conf) {
return ([
conf["withGpu"] ? '-DUSE_CUDA=ON' : '-DUSE_CUDA=OFF',
conf["withNccl"] ? '-DUSE_NCCL=ON' : '-DUSE_NCCL=OFF',
conf["withOmp"] ? '-DOPEN_MP:BOOL=ON' : '']
).join(" ")
}

def getBuildName(conf) {
def gpuLabel = conf['withGpu'] ? ("_cuda" + conf['cudaVersion'] + (conf['withNccl'] ? "_nccl" : "_nonccl")) : "_cpu"
def ompLabel = conf['withOmp'] ? "_omp" : ""
def pyLabel = "_py${conf['pythonVersion']}"
return "${conf['os']}${gpuLabel}${ompLabel}${pyLabel}"
}

return this
@ -4,6 +4,6 @@ set -e
cd python-package
python setup.py install --user
cd ..
python -m nose --attr='!slow' tests/python-gpu/
python -m nose -v --attr='!slow' tests/python-gpu/
./testxgboost


@ -49,7 +49,7 @@ class TestGPUPredict(unittest.TestCase):
# Test case for a bug where multiple batch predictions made on a test set produce incorrect results
def test_multi_predict(self):
from sklearn.datasets import make_regression
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

n = 1000
X, y = make_regression(n, random_state=rng)

@ -22,21 +22,13 @@ class TemporaryDirectory(object):
def test_binary_classification():
tm._skip_if_no_sklearn()
from sklearn.datasets import load_digits
try:
from sklearn.model_selection import KFold
except:
from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold

digits = load_digits(2)
y = digits['target']
X = digits['data']
try:
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
except TypeError: # sklearn.model_selection.KFold uses n_split
kf = KFold(
n_splits=2, shuffle=True, random_state=rng
).split(np.arange(y.shape[0]))
for train_index, test_index in kf:
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
preds = xgb_model.predict(X[test_index])
labels = y[test_index]
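The test changes above drop the old ``sklearn.cross_validation`` fallback entirely; the essential API difference is sketched below on stand-in data:

.. code-block:: python

    import numpy as np
    from sklearn.model_selection import KFold

    rng = np.random.RandomState(1994)   # same seeding style as the tests
    X = np.arange(40).reshape(20, 2)    # stand-in data, illustrative only
    y = np.tile([0, 1], 10)

    # Old API (sklearn < 0.18): KFold(n, n_folds=...) and the object was iterable.
    # New API (sklearn >= 0.18): KFold(n_splits=...) plus an explicit split(X, y).
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]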
@ -48,10 +40,7 @@ def test_binary_classification():
def test_multiclass_classification():
tm._skip_if_no_sklearn()
from sklearn.datasets import load_iris
try:
from sklearn.cross_validation import KFold
except:
from sklearn.model_selection import KFold
from sklearn.model_selection import KFold

def check_pred(preds, labels):
err = sum(1 for i in range(len(preds))
@ -61,8 +50,8 @@ def test_multiclass_classification():
iris = load_iris()
y = iris['target']
X = iris['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
preds = xgb_model.predict(X[test_index])
# test other params in XGBClassifier().fit
@ -111,13 +100,13 @@ def test_boston_housing_regression():
tm._skip_if_no_sklearn()
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold

boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])

preds = xgb_model.predict(X[test_index])
@ -135,7 +124,7 @@ def test_boston_housing_regression():

def test_parameter_tuning():
tm._skip_if_no_sklearn()
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_boston

boston = load_boston()
@ -143,7 +132,8 @@ def test_parameter_tuning():
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
'n_estimators': [50, 100, 200]}, verbose=1)
'n_estimators': [50, 100, 200]},
cv=3, verbose=1, iid=True)
clf.fit(X, y)
assert clf.best_score_ < 0.7
assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4}
@ -153,7 +143,7 @@ def test_regression_with_custom_objective():
tm._skip_if_no_sklearn()
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold

def objective_ls(y_true, y_pred):
grad = (y_pred - y_true)
@ -163,8 +153,8 @@ def test_regression_with_custom_objective():
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
X[train_index], y[train_index]
)
@ -186,7 +176,7 @@ def test_regression_with_custom_objective():
def test_classification_with_custom_objective():
tm._skip_if_no_sklearn()
from sklearn.datasets import load_digits
from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold

def logregobj(y_true, y_pred):
y_pred = 1.0 / (1.0 + np.exp(-y_pred))
@ -197,8 +187,8 @@ def test_classification_with_custom_objective():
digits = load_digits(2)
y = digits['target']
X = digits['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
xgb_model = xgb.XGBClassifier(objective=logregobj)
xgb_model.fit(X[train_index], y[train_index])
preds = xgb_model.predict(X[test_index])
@ -225,10 +215,11 @@ def test_classification_with_custom_objective():
def test_sklearn_api():
tm._skip_if_no_sklearn()
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

iris = load_iris()
tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, train_size=120)
tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target,
train_size=120, test_size=0.2)

classifier = xgb.XGBClassifier(booster='gbtree', n_estimators=10)
classifier.fit(tr_d, tr_l)
@ -242,7 +233,7 @@ def test_sklearn_api_gblinear():
def test_sklearn_api_gblinear():
tm._skip_if_no_sklearn()
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

iris = load_iris()
tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, train_size=120)
@ -476,23 +467,15 @@ def test_validation_weights_xgbclassifier():
def test_save_load_model():
tm._skip_if_no_sklearn()
from sklearn.datasets import load_digits
try:
from sklearn.model_selection import KFold
except:
from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold

digits = load_digits(2)
y = digits['target']
X = digits['data']
try:
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
except TypeError: # sklearn.model_selection.KFold uses n_split
kf = KFold(
n_splits=2, shuffle=True, random_state=rng
).split(np.arange(y.shape[0]))
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
with TemporaryDirectory() as tempdir:
model_path = os.path.join(tempdir, 'digits.model')
for train_index, test_index in kf:
for train_index, test_index in kf.split(X, y):
xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
xgb_model.save_model(model_path)
xgb_model = xgb.XGBModel()

@ -56,7 +56,7 @@ if [ ${TASK} == "python_test" ]; then
python -m pip install datatable --no-binary datatable

python -m pip install graphviz pytest pytest-cov codecov
python -m nose tests/python || exit -1
python -m nose -v tests/python || exit -1
py.test tests/python --cov=python-package/xgboost
codecov
source activate python2
@ -64,7 +64,7 @@ if [ ${TASK} == "python_test" ]; then
python --version
conda install numpy scipy pandas matplotlib nose scikit-learn
python -m pip install graphviz
python -m nose tests/python || exit -1
python -m nose -v tests/python || exit -1
exit 0
fi

@ -75,7 +75,7 @@ if [ ${TASK} == "python_lightweight_test" ]; then
python --version
conda install numpy scipy nose
python -m pip install graphviz pytest pytest-cov codecov
python -m nose tests/python || exit -1
python -m nose -v tests/python || exit -1
py.test tests/python --cov=python-package/xgboost
codecov
source activate python2
@ -83,7 +83,7 @@ if [ ${TASK} == "python_lightweight_test" ]; then
python --version
conda install numpy scipy nose
python -m pip install graphviz
python -m nose tests/python || exit -1
python -m nose -v tests/python || exit -1
python -m pip install flake8==3.4.1
flake8 --ignore E501 python-package || exit -1
flake8 --ignore E501 tests/python || exit -1