Compare commits
20 Commits
v0.80
...
release_0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5c152d730c | ||
|
|
bc35b8e97b | ||
|
|
b1233ef2ae | ||
|
|
a8d815fc1e | ||
|
|
1ade39d73c | ||
|
|
a46f13cee1 | ||
|
|
953ed1a99b | ||
|
|
d1c250f8cf | ||
|
|
4b0339c75d | ||
|
|
4059b932e2 | ||
|
|
254e816711 | ||
|
|
71eaa26c7b | ||
|
|
8ba1b8f5ed | ||
|
|
4334b9cc91 | ||
|
|
e19dded9a3 | ||
|
|
d0f45bede0 | ||
|
|
23bc3fc4aa | ||
|
|
e1f0715981 | ||
|
|
229ae58259 | ||
|
|
1467109ae1 |
@@ -5,6 +5,7 @@ CheckOptions:
|
||||
- { key: readability-identifier-naming.TypeAliasCase, value: CamelCase }
|
||||
- { key: readability-identifier-naming.TypedefCase, value: CamelCase }
|
||||
- { key: readability-identifier-naming.TypeTemplateParameterCase, value: CamelCase }
|
||||
- { key: readability-identifier-naming.LocalVariableCase, value: lower_case }
|
||||
- { key: readability-identifier-naming.MemberCase, value: lower_case }
|
||||
- { key: readability-identifier-naming.PrivateMemberSuffix, value: '_' }
|
||||
- { key: readability-identifier-naming.ProtectedMemberSuffix, value: '_' }
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
root = true
|
||||
|
||||
[*]
|
||||
charset=utf-8
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
insert_final_newline = true
|
||||
|
||||
[*.py]
|
||||
indent_style = space
|
||||
indent_size = 4
|
||||
7
.github/ISSUE_TEMPLATE.md
vendored
7
.github/ISSUE_TEMPLATE.md
vendored
@@ -1,7 +0,0 @@
|
||||
Thanks for participating in the XGBoost community! We use https://discuss.xgboost.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking. You are always welcomed to post on the forum first :)
|
||||
|
||||
Issues that are inactive for a period of time may get closed. We adopt this policy so that we won't lose track of actionable issues that may fall at the bottom of the pile. Feel free to reopen a new one if you feel there is an additional problem that needs attention when an old one gets closed.
|
||||
|
||||
For bug reports, to help the developer act on the issues, please include a description of your environment, preferably a minimum script to reproduce the problem.
|
||||
|
||||
For feature proposals, list clear, small actionable items so we can track the progress of the change.
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -4,6 +4,9 @@
|
||||
[submodule "rabit"]
|
||||
path = rabit
|
||||
url = https://github.com/dmlc/rabit
|
||||
[submodule "nccl"]
|
||||
path = nccl
|
||||
url = https://github.com/dmlc/nccl
|
||||
[submodule "cub"]
|
||||
path = cub
|
||||
url = https://github.com/NVlabs/cub
|
||||
|
||||
@@ -26,8 +26,6 @@ env:
|
||||
- TASK=cmake_test
|
||||
# c++ test
|
||||
- TASK=cpp_test
|
||||
# distributed test
|
||||
- TASK=distributed_test
|
||||
|
||||
matrix:
|
||||
exclude:
|
||||
@@ -41,8 +39,6 @@ matrix:
|
||||
env: TASK=python_lightweight_test
|
||||
- os: osx
|
||||
env: TASK=cpp_test
|
||||
- os: osx
|
||||
env: TASK=distributed_test
|
||||
|
||||
# dependent apt packages
|
||||
addons:
|
||||
|
||||
@@ -8,18 +8,14 @@ set_default_configuration_release()
|
||||
msvc_use_static_runtime()
|
||||
|
||||
# Options
|
||||
option(USE_CUDA "Build with GPU acceleration")
|
||||
option(USE_AVX "Build with AVX instructions. May not produce identical results due to approximate math." OFF)
|
||||
option(USE_NCCL "Build using NCCL for multi-GPU. Also requires USE_CUDA")
|
||||
option(USE_CUDA "Build with GPU acceleration")
|
||||
option(USE_AVX "Build with AVX instructions. May not produce identical results due to approximate math." OFF)
|
||||
option(USE_NCCL "Build using NCCL for multi-GPU. Also requires USE_CUDA")
|
||||
option(JVM_BINDINGS "Build JVM bindings" OFF)
|
||||
option(GOOGLE_TEST "Build google tests" OFF)
|
||||
option(R_LIB "Build shared library for R package" OFF)
|
||||
option(USE_SANITIZER "Use santizer flags" OFF)
|
||||
set(GPU_COMPUTE_VER "" CACHE STRING
|
||||
"Space separated list of compute versions to be built against, e.g. '35 61'")
|
||||
set(ENABLED_SANITIZERS "address" "leak" CACHE STRING
|
||||
"Semicolon separated list of sanitizer names. E.g 'address;leak'. Supported sanitizers are
|
||||
address, leak and thread.")
|
||||
|
||||
# Deprecation warning
|
||||
if(PLUGIN_UPDATER_GPU)
|
||||
@@ -43,15 +39,6 @@ else()
|
||||
# Performance
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funroll-loops")
|
||||
endif()
|
||||
if(WIN32 AND MINGW)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
|
||||
endif()
|
||||
|
||||
# Sanitizer
|
||||
if(USE_SANITIZER)
|
||||
include(cmake/Sanitizer.cmake)
|
||||
enable_sanitizers("${ENABLED_SANITIZERS}")
|
||||
endif(USE_SANITIZER)
|
||||
|
||||
# AVX
|
||||
if(USE_AVX)
|
||||
@@ -63,12 +50,6 @@ if(USE_AVX)
|
||||
add_definitions(-DXGBOOST_USE_AVX)
|
||||
endif()
|
||||
|
||||
# dmlc-core
|
||||
add_subdirectory(dmlc-core)
|
||||
set(LINK_LIBRARIES dmlc rabit)
|
||||
|
||||
# enable custom logging
|
||||
add_definitions(-DDMLC_LOG_CUSTOMIZE=1)
|
||||
|
||||
# compiled code customizations for R package
|
||||
if(R_LIB)
|
||||
@@ -89,7 +70,7 @@ include_directories (
|
||||
${PROJECT_SOURCE_DIR}/rabit/include
|
||||
)
|
||||
|
||||
file(GLOB_RECURSE SOURCES
|
||||
file(GLOB_RECURSE SOURCES
|
||||
src/*.cc
|
||||
src/*.h
|
||||
include/*.h
|
||||
@@ -122,17 +103,22 @@ else()
|
||||
add_library(rabit STATIC ${RABIT_SOURCES})
|
||||
endif()
|
||||
|
||||
|
||||
# dmlc-core
|
||||
add_subdirectory(dmlc-core)
|
||||
set(LINK_LIBRARIES dmlc rabit)
|
||||
|
||||
|
||||
if(USE_CUDA)
|
||||
find_package(CUDA 8.0 REQUIRED)
|
||||
cmake_minimum_required(VERSION 3.5)
|
||||
|
||||
add_definitions(-DXGBOOST_USE_CUDA)
|
||||
|
||||
|
||||
include_directories(cub)
|
||||
|
||||
if(USE_NCCL)
|
||||
find_package(Nccl REQUIRED)
|
||||
include_directories(${NCCL_INCLUDE_DIR})
|
||||
include_directories(nccl/src)
|
||||
add_definitions(-DXGBOOST_USE_NCCL)
|
||||
endif()
|
||||
|
||||
@@ -145,13 +131,16 @@ if(USE_CUDA)
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -Xcompiler -Werror; -std=c++11")
|
||||
endif()
|
||||
|
||||
cuda_add_library(gpuxgboost ${CUDA_SOURCES} STATIC)
|
||||
|
||||
if(USE_NCCL)
|
||||
link_directories(${NCCL_LIBRARY})
|
||||
target_link_libraries(gpuxgboost ${NCCL_LIB_NAME})
|
||||
add_subdirectory(nccl)
|
||||
endif()
|
||||
list(APPEND LINK_LIBRARIES gpuxgboost)
|
||||
|
||||
cuda_add_library(gpuxgboost ${CUDA_SOURCES} STATIC)
|
||||
|
||||
if(USE_NCCL)
|
||||
target_link_libraries(gpuxgboost nccl)
|
||||
endif()
|
||||
list(APPEND LINK_LIBRARIES gpuxgboost)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -232,12 +221,12 @@ endif()
|
||||
|
||||
# Test
|
||||
if(GOOGLE_TEST)
|
||||
enable_testing()
|
||||
find_package(GTest REQUIRED)
|
||||
enable_testing()
|
||||
|
||||
file(GLOB_RECURSE TEST_SOURCES "tests/cpp/*.cc")
|
||||
auto_source_group("${TEST_SOURCES}")
|
||||
include_directories(${GTEST_INCLUDE_DIRS})
|
||||
include_directories(${GTEST_INCLUDE_DIR})
|
||||
|
||||
if(USE_CUDA)
|
||||
file(GLOB_RECURSE CUDA_TEST_SOURCES "tests/cpp/*.cu")
|
||||
|
||||
@@ -73,8 +73,3 @@ List of Contributors
|
||||
* [Gideon Whitehead](https://github.com/gaw89)
|
||||
* [Yi-Lin Juang](https://github.com/frankyjuang)
|
||||
* [Andrew Hannigan](https://github.com/andrewhannigan)
|
||||
* [Andy Adinets](https://github.com/canonizer)
|
||||
* [Henry Gouk](https://github.com/henrygouk)
|
||||
* [Pierre de Sahb](https://github.com/pdesahb)
|
||||
* [liuliang01](https://github.com/liuliang01)
|
||||
- liuliang01 added support for the qid column for LibSVM input format. This makes ranking task easier in distributed setting.
|
||||
|
||||
44
ISSUE_TEMPLATE.md
Normal file
44
ISSUE_TEMPLATE.md
Normal file
@@ -0,0 +1,44 @@
|
||||
For bugs or installation issues, please provide the following information.
|
||||
The more information you provide, the more easily we will be able to offer
|
||||
help and advice.
|
||||
|
||||
## Environment info
|
||||
Operating System:
|
||||
|
||||
Compiler:
|
||||
|
||||
Package used (python/R/jvm/C++):
|
||||
|
||||
`xgboost` version used:
|
||||
|
||||
If installing from source, please provide
|
||||
|
||||
1. The commit hash (`git rev-parse HEAD`)
|
||||
2. Logs will be helpful (If logs are large, please upload as attachment).
|
||||
|
||||
If you are using jvm package, please
|
||||
|
||||
1. add [jvm-packages] in the title to make it quickly be identified
|
||||
2. the gcc version and distribution
|
||||
|
||||
If you are using python package, please provide
|
||||
|
||||
1. The python version and distribution
|
||||
2. The command to install `xgboost` if you are not installing from source
|
||||
|
||||
If you are using R package, please provide
|
||||
|
||||
1. The R `sessionInfo()`
|
||||
2. The command to install `xgboost` if you are not installing from source
|
||||
|
||||
## Steps to reproduce
|
||||
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
|
||||
## What have you tried?
|
||||
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
105
Jenkinsfile
vendored
105
Jenkinsfile
vendored
@@ -3,11 +3,18 @@
|
||||
// Jenkins pipeline
|
||||
// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
|
||||
|
||||
import groovy.transform.Field
|
||||
|
||||
/* Unrestricted tasks: tasks that do NOT generate artifacts */
|
||||
|
||||
// Command to run command inside a docker container
|
||||
dockerRun = 'tests/ci_build/ci_build.sh'
|
||||
def dockerRun = 'tests/ci_build/ci_build.sh'
|
||||
// Utility functions
|
||||
@Field
|
||||
def utils
|
||||
|
||||
def buildMatrix = [
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "9.2" ],
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "9.1" ],
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ],
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": false, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ],
|
||||
]
|
||||
@@ -26,42 +33,25 @@ pipeline {
|
||||
|
||||
// Build stages
|
||||
stages {
|
||||
stage('Get sources') {
|
||||
agent any
|
||||
stage('Jenkins: Get sources') {
|
||||
agent {
|
||||
label 'unrestricted'
|
||||
}
|
||||
steps {
|
||||
checkoutSrcs()
|
||||
script {
|
||||
utils = load('tests/ci_build/jenkins_tools.Groovy')
|
||||
utils.checkoutSrcs()
|
||||
}
|
||||
stash name: 'srcs', excludes: '.git/'
|
||||
milestone label: 'Sources ready', ordinal: 1
|
||||
}
|
||||
}
|
||||
stage('Build doc') {
|
||||
agent any
|
||||
steps {
|
||||
script {
|
||||
if (env.CHANGE_ID == null) { // This is a branch
|
||||
def commit_id = "${GIT_COMMIT}"
|
||||
def branch_name = "${GIT_LOCAL_BRANCH}"
|
||||
echo 'Building doc...'
|
||||
dir ('jvm-packages') {
|
||||
sh "bash ./build_doc.sh ${commit_id}"
|
||||
archiveArtifacts artifacts: "${commit_id}.tar.bz2", allowEmptyArchive: true
|
||||
echo 'Deploying doc...'
|
||||
withAWS(credentials:'xgboost-doc-bucket') {
|
||||
s3Upload file: "${commit_id}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${branch_name}.tar.bz2"
|
||||
}
|
||||
}
|
||||
} else { // This is a pull request
|
||||
echo 'Skipping doc build step for pull request'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
stage('Build & Test') {
|
||||
stage('Jenkins: Build & Test') {
|
||||
steps {
|
||||
script {
|
||||
parallel (buildMatrix.findAll{it['enabled']}.collectEntries{ c ->
|
||||
def buildName = getBuildName(c)
|
||||
buildFactory(buildName, c)
|
||||
def buildName = utils.getBuildName(c)
|
||||
utils.buildFactory(buildName, c, false, this.&buildPlatformCmake)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -69,37 +59,11 @@ pipeline {
|
||||
}
|
||||
}
|
||||
|
||||
// initialize source codes
|
||||
def checkoutSrcs() {
|
||||
retry(5) {
|
||||
try {
|
||||
timeout(time: 2, unit: 'MINUTES') {
|
||||
checkout scm
|
||||
sh 'git submodule update --init'
|
||||
}
|
||||
} catch (exc) {
|
||||
deleteDir()
|
||||
error "Failed to fetch source codes"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates cmake and make builds
|
||||
*/
|
||||
def buildFactory(buildName, conf) {
|
||||
def os = conf["os"]
|
||||
def nodeReq = conf["withGpu"] ? "${os} && gpu" : "${os}"
|
||||
def dockerTarget = conf["withGpu"] ? "gpu" : "cpu"
|
||||
[ ("${buildName}") : { buildPlatformCmake("${buildName}", conf, nodeReq, dockerTarget) }
|
||||
]
|
||||
}
|
||||
|
||||
/**
|
||||
* Build platform and test it via cmake.
|
||||
*/
|
||||
def buildPlatformCmake(buildName, conf, nodeReq, dockerTarget) {
|
||||
def opts = cmakeOptions(conf)
|
||||
def opts = utils.cmakeOptions(conf)
|
||||
// Destination dir for artifacts
|
||||
def distDir = "dist/${buildName}"
|
||||
def dockerArgs = ""
|
||||
@@ -119,33 +83,6 @@ def buildPlatformCmake(buildName, conf, nodeReq, dockerTarget) {
|
||||
sh """
|
||||
${dockerRun} ${dockerTarget} ${dockerArgs} tests/ci_build/build_via_cmake.sh ${opts}
|
||||
${dockerRun} ${dockerTarget} ${dockerArgs} tests/ci_build/test_${dockerTarget}.sh
|
||||
${dockerRun} ${dockerTarget} ${dockerArgs} bash -c "cd python-package; rm -f dist/*; python setup.py bdist_wheel --universal"
|
||||
rm -rf "${distDir}"; mkdir -p "${distDir}/py"
|
||||
cp xgboost "${distDir}"
|
||||
cp -r lib "${distDir}"
|
||||
cp -r python-package/dist "${distDir}/py"
|
||||
# Test the wheel for compatibility on a barebones CPU container
|
||||
${dockerRun} release ${dockerArgs} bash -c " \
|
||||
auditwheel show xgboost-*-py2-none-any.whl
|
||||
pip install --user python-package/dist/xgboost-*-none-any.whl && \
|
||||
python -m nose tests/python"
|
||||
"""
|
||||
archiveArtifacts artifacts: "${distDir}/**/*.*", allowEmptyArchive: true
|
||||
}
|
||||
}
|
||||
|
||||
def cmakeOptions(conf) {
|
||||
return ([
|
||||
conf["withGpu"] ? '-DUSE_CUDA=ON' : '-DUSE_CUDA=OFF',
|
||||
conf["withNccl"] ? '-DUSE_NCCL=ON' : '-DUSE_NCCL=OFF',
|
||||
conf["withOmp"] ? '-DOPEN_MP:BOOL=ON' : '']
|
||||
).join(" ")
|
||||
}
|
||||
|
||||
def getBuildName(conf) {
|
||||
def gpuLabel = conf['withGpu'] ? ("_cuda" + conf['cudaVersion'] + (conf['withNccl'] ? "_nccl" : "_nonccl")) : "_cpu"
|
||||
def ompLabel = conf['withOmp'] ? "_omp" : ""
|
||||
def pyLabel = "_py${conf['pythonVersion']}"
|
||||
return "${conf['os']}${gpuLabel}${ompLabel}${pyLabel}"
|
||||
}
|
||||
|
||||
|
||||
121
Jenkinsfile-restricted
Normal file
121
Jenkinsfile-restricted
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/groovy
|
||||
// -*- mode: groovy -*-
|
||||
// Jenkins pipeline
|
||||
// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
|
||||
|
||||
import groovy.transform.Field
|
||||
|
||||
/* Restricted tasks: tasks generating artifacts, such as binary wheels and
|
||||
documentation */
|
||||
|
||||
// Command to run command inside a docker container
|
||||
def dockerRun = 'tests/ci_build/ci_build.sh'
|
||||
// Utility functions
|
||||
@Field
|
||||
def utils
|
||||
|
||||
def buildMatrix = [
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "9.2" ],
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ],
|
||||
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": false, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ],
|
||||
]
|
||||
|
||||
pipeline {
|
||||
// Each stage specify its own agent
|
||||
agent none
|
||||
|
||||
// Setup common job properties
|
||||
options {
|
||||
ansiColor('xterm')
|
||||
timestamps()
|
||||
timeout(time: 120, unit: 'MINUTES')
|
||||
buildDiscarder(logRotator(numToKeepStr: '10'))
|
||||
}
|
||||
|
||||
// Build stages
|
||||
stages {
|
||||
stage('Jenkins: Get sources') {
|
||||
agent {
|
||||
label 'restricted'
|
||||
}
|
||||
steps {
|
||||
script {
|
||||
utils = load('tests/ci_build/jenkins_tools.Groovy')
|
||||
utils.checkoutSrcs()
|
||||
}
|
||||
stash name: 'srcs', excludes: '.git/'
|
||||
milestone label: 'Sources ready', ordinal: 1
|
||||
}
|
||||
}
|
||||
stage('Jenkins: Build doc') {
|
||||
agent {
|
||||
label 'linux && cpu && restricted'
|
||||
}
|
||||
steps {
|
||||
unstash name: 'srcs'
|
||||
script {
|
||||
def commit_id = "${GIT_COMMIT}"
|
||||
def branch_name = "${GIT_LOCAL_BRANCH}"
|
||||
echo 'Building doc...'
|
||||
dir ('jvm-packages') {
|
||||
sh "bash ./build_doc.sh ${commit_id}"
|
||||
archiveArtifacts artifacts: "${commit_id}.tar.bz2", allowEmptyArchive: true
|
||||
echo 'Deploying doc...'
|
||||
withAWS(credentials:'xgboost-doc-bucket') {
|
||||
s3Upload file: "${commit_id}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${branch_name}.tar.bz2"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stage('Jenkins: Build artifacts') {
|
||||
steps {
|
||||
script {
|
||||
parallel (buildMatrix.findAll{it['enabled']}.collectEntries{ c ->
|
||||
def buildName = utils.getBuildName(c)
|
||||
utils.buildFactory(buildName, c, true, this.&buildPlatformCmake)
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build platform and test it via cmake.
|
||||
*/
|
||||
def buildPlatformCmake(buildName, conf, nodeReq, dockerTarget) {
|
||||
def opts = utils.cmakeOptions(conf)
|
||||
// Destination dir for artifacts
|
||||
def distDir = "dist/${buildName}"
|
||||
def dockerArgs = ""
|
||||
if(conf["withGpu"]){
|
||||
dockerArgs = "--build-arg CUDA_VERSION=" + conf["cudaVersion"]
|
||||
}
|
||||
// Build node - this is returned result
|
||||
node(nodeReq) {
|
||||
unstash name: 'srcs'
|
||||
echo """
|
||||
|===== XGBoost CMake build =====
|
||||
| dockerTarget: ${dockerTarget}
|
||||
| cmakeOpts : ${opts}
|
||||
|=========================
|
||||
""".stripMargin('|')
|
||||
// Invoke command inside docker
|
||||
sh """
|
||||
${dockerRun} ${dockerTarget} ${dockerArgs} tests/ci_build/build_via_cmake.sh ${opts}
|
||||
${dockerRun} ${dockerTarget} ${dockerArgs} bash -c "cd python-package; rm -f dist/*; python setup.py bdist_wheel --universal"
|
||||
rm -rf "${distDir}"; mkdir -p "${distDir}/py"
|
||||
cp xgboost "${distDir}"
|
||||
cp -r lib "${distDir}"
|
||||
cp -r python-package/dist "${distDir}/py"
|
||||
# Test the wheel for compatibility on a barebones CPU container
|
||||
${dockerRun} release ${dockerArgs} bash -c " \
|
||||
auditwheel show xgboost-*-py2-none-any.whl
|
||||
pip install --user python-package/dist/xgboost-*-none-any.whl && \
|
||||
python -m nose tests/python"
|
||||
"""
|
||||
archiveArtifacts artifacts: "${distDir}/**/*.*", allowEmptyArchive: true
|
||||
}
|
||||
}
|
||||
2
Makefile
2
Makefile
@@ -68,7 +68,7 @@ endif
|
||||
endif
|
||||
|
||||
export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS) $(PLUGIN_LDFLAGS)
|
||||
export CFLAGS= -DDMLC_LOG_CUSTOMIZE=1 -std=c++11 -Wall -Wno-unknown-pragmas -Iinclude $(ADD_CFLAGS) $(PLUGIN_CFLAGS)
|
||||
export CFLAGS= -std=c++11 -Wall -Wno-unknown-pragmas -Iinclude $(ADD_CFLAGS) $(PLUGIN_CFLAGS)
|
||||
CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include -I$(GTEST_PATH)/include
|
||||
#java include path
|
||||
export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java
|
||||
|
||||
52
NEWS.md
52
NEWS.md
@@ -3,58 +3,6 @@ XGBoost Change Log
|
||||
|
||||
This file records the changes in xgboost library in reverse chronological order.
|
||||
|
||||
## v0.80 (2018.08.13)
|
||||
* **JVM packages received a major upgrade**: To consolidate the APIs and improve the user experience, we refactored the design of XGBoost4J-Spark in a significant manner. (#3387)
|
||||
- Consolidated APIs: It is now much easier to integrate XGBoost models into a Spark ML pipeline. Users can control behaviors like output leaf prediction results by setting corresponding column names. Training is now more consistent with other Estimators in Spark MLLIB: there is now one single method `fit()` to train decision trees.
|
||||
- Better user experience: we refactored the parameters relevant modules in XGBoost4J-Spark to provide both camel-case (Spark ML style) and underscore (XGBoost style) parameters
|
||||
- A brand-new tutorial is [available](https://xgboost.readthedocs.io/en/release_0.80/jvm/xgboost4j_spark_tutorial.html) for XGBoost4J-Spark.
|
||||
- Latest API documentation is now hosted at https://xgboost.readthedocs.io/.
|
||||
* XGBoost documentation now keeps track of multiple versions:
|
||||
- Latest master: https://xgboost.readthedocs.io/en/latest
|
||||
- 0.80 stable: https://xgboost.readthedocs.io/en/release_0.80
|
||||
- 0.72 stable: https://xgboost.readthedocs.io/en/release_0.72
|
||||
* Ranking task now uses instance weights (#3379)
|
||||
* Fix inaccurate decimal parsing (#3546)
|
||||
* New functionality
|
||||
- Query ID column support in LIBSVM data files (#2749). This is convenient for performing ranking task in distributed setting.
|
||||
- Hinge loss for binary classification (`binary:hinge`) (#3477)
|
||||
- Ability to specify delimiter and instance weight column for CSV files (#3546)
|
||||
- Ability to use 1-based indexing instead of 0-based (#3546)
|
||||
* GPU support
|
||||
- Quantile sketch, binning, and index compression are now performed on GPU, eliminating PCIe transfer for 'gpu_hist' algorithm (#3319, #3393)
|
||||
- Upgrade to NCCL2 for multi-GPU training (#3404).
|
||||
- Use shared memory atomics for faster training (#3384).
|
||||
- Dynamically allocate GPU memory, to prevent large allocations for deep trees (#3519)
|
||||
- Fix memory copy bug for large files (#3472)
|
||||
* Python package
|
||||
- Importing data from Python datatable (#3272)
|
||||
- Pre-built binary wheels available for 64-bit Linux and Windows (#3424, #3443)
|
||||
- Add new importance measures 'total_gain', 'total_cover' (#3498)
|
||||
- Sklearn API now supports saving and loading models (#3192)
|
||||
- Arbitrary cross validation fold indices (#3353)
|
||||
- `predict()` function in Sklearn API uses `best_ntree_limit` if available, to make early stopping easier to use (#3445)
|
||||
- Informational messages are now directed to Python's `print()` rather than standard output (#3438). This way, messages appear inside Jupyter notebooks.
|
||||
* R package
|
||||
- Oracle Solaris support, per CRAN policy (#3372)
|
||||
* JVM packages
|
||||
- Single-instance prediction (#3464)
|
||||
- Pre-built JARs are now available from Maven Central (#3401)
|
||||
- Add NULL pointer check (#3021)
|
||||
- Consider `spark.task.cpus` when controlling parallelism (#3530)
|
||||
- Handle missing values in prediction (#3529)
|
||||
- Eliminate outputs of `System.out` (#3572)
|
||||
* Refactored C++ DMatrix class for simplicity and de-duplication (#3301)
|
||||
* Refactored C++ histogram facilities (#3564)
|
||||
* Refactored constraints / regularization mechanism for split finding (#3335, #3429). Users may specify an elastic net (L2 + L1 regularization) on leaf weights as well as monotonic constraints on test nodes. The refactor will be useful for a future addition of feature interaction constraints.
|
||||
* Statically link `libstdc++` for MinGW32 (#3430)
|
||||
* Enable loading from `group`, `base_margin` and `weight` (see [here](http://xgboost.readthedocs.io/en/latest/tutorials/input_format.html#auxiliary-files-for-additional-information)) for Python, R, and JVM packages (#3431)
|
||||
* Fix model saving for `count:possion` so that `max_delta_step` doesn't get truncated (#3515)
|
||||
* Fix loading of sparse CSC matrix (#3553)
|
||||
* Fix incorrect handling of `base_score` parameter for Tweedie regression (#3295)
|
||||
|
||||
## v0.72.1 (2018.07.08)
|
||||
This version is only applicable for the Python package. The content is identical to that of v0.72.
|
||||
|
||||
## v0.72 (2018.06.01)
|
||||
* Starting with this release, we plan to make a new release every two months. See #3252 for more details.
|
||||
* Fix a pathological behavior (near-zero second-order gradients) in multiclass objective (#3304)
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
Package: xgboost
|
||||
Type: Package
|
||||
Title: Extreme Gradient Boosting
|
||||
Version: 0.80.1
|
||||
Date: 2018-08-13
|
||||
Version: 0.71.1
|
||||
Date: 2018-05-11
|
||||
Authors@R: c(
|
||||
person("Tianqi", "Chen", role = c("aut"),
|
||||
email = "tianqi.tchen@gmail.com"),
|
||||
@@ -51,7 +51,6 @@ Suggests:
|
||||
Ckmeans.1d.dp (>= 3.3.1),
|
||||
vcd (>= 1.3),
|
||||
testthat,
|
||||
lintr,
|
||||
igraph (>= 1.0.1)
|
||||
Depends:
|
||||
R (>= 3.3.0)
|
||||
|
||||
@@ -212,7 +212,6 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
|
||||
}
|
||||
if (plot && which == "2d") {
|
||||
# TODO
|
||||
warning("Bivariate plotting is currently not available.")
|
||||
}
|
||||
invisible(list(data = data, shap_contrib = shap_contrib))
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
|
||||
|
||||
# disable the use of thread_local for 32 bit windows:
|
||||
ifeq ($(R_OSTYPE)$(WIN),windows)
|
||||
XGB_RFLAGS += -DDMLC_CXX11_THREAD_LOCAL=0 -msse2 -mfpmath=sse
|
||||
XGB_RFLAGS += -DDMLC_CXX11_THREAD_LOCAL=0
|
||||
endif
|
||||
$(foreach v, $(XGB_RFLAGS), $(warning $(v)))
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
|
||||
|
||||
# disable the use of thread_local for 32 bit windows:
|
||||
ifeq ($(R_OSTYPE)$(WIN),windows)
|
||||
XGB_RFLAGS += -DDMLC_CXX11_THREAD_LOCAL=0 -msse2 -mfpmath=sse
|
||||
XGB_RFLAGS += -DDMLC_CXX11_THREAD_LOCAL=0
|
||||
endif
|
||||
$(foreach v, $(XGB_RFLAGS), $(warning $(v)))
|
||||
|
||||
|
||||
@@ -77,18 +77,6 @@ test_that("xgb.DMatrix: slice, dim", {
|
||||
expect_equal(getinfo(dsub1, 'label'), getinfo(dsub2, 'label'))
|
||||
})
|
||||
|
||||
test_that("xgb.DMatrix: slice, trailing empty rows", {
|
||||
data(agaricus.train, package='xgboost')
|
||||
train_data <- agaricus.train$data
|
||||
train_label <- agaricus.train$label
|
||||
dtrain <- xgb.DMatrix(data=train_data, label=train_label)
|
||||
slice(dtrain, 6513L)
|
||||
train_data[6513, ] <- 0
|
||||
dtrain <- xgb.DMatrix(data=train_data, label=train_label)
|
||||
slice(dtrain, 6513L)
|
||||
expect_equal(nrow(dtrain), 6513)
|
||||
})
|
||||
|
||||
test_that("xgb.DMatrix: colnames", {
|
||||
dtest <- xgb.DMatrix(test_data, label=test_label)
|
||||
expect_equal(colnames(dtest), colnames(test_data))
|
||||
|
||||
38
README.md
38
README.md
@@ -6,28 +6,46 @@
|
||||
[](./LICENSE)
|
||||
[](http://cran.r-project.org/web/packages/xgboost)
|
||||
[](https://pypi.python.org/pypi/xgboost/)
|
||||
[](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
|
||||
[Community](https://xgboost.ai/community) |
|
||||
[Documentation](https://xgboost.readthedocs.org) |
|
||||
[Resources](demo/README.md) |
|
||||
[Contributors](CONTRIBUTORS.md) |
|
||||
[Release Notes](NEWS.md)
|
||||
[Installation](https://xgboost.readthedocs.org/en/latest/build.html) |
|
||||
[Release Notes](NEWS.md) |
|
||||
[RoadMap](https://github.com/dmlc/xgboost/issues/873)
|
||||
|
||||
XGBoost is an optimized distributed gradient boosting library designed to be highly ***efficient***, ***flexible*** and ***portable***.
|
||||
It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework.
|
||||
XGBoost provides a parallel tree boosting (also known as GBDT, GBM) that solve many data science problems in a fast and accurate way.
|
||||
The same code runs on major distributed environment (Hadoop, SGE, MPI) and can solve problems beyond billions of examples.
|
||||
|
||||
What's New
|
||||
----------
|
||||
* [XGBoost GPU support with fast histogram algorithm](https://github.com/dmlc/xgboost/tree/master/plugin/updater_gpu)
|
||||
* [XGBoost4J: Portable Distributed XGboost in Spark, Flink and Dataflow](http://dmlc.ml/2016/03/14/xgboost4j-portable-distributed-xgboost-in-spark-flink-and-dataflow.html), see [JVM-Package](https://github.com/dmlc/xgboost/tree/master/jvm-packages)
|
||||
* [Story and Lessons Behind the Evolution of XGBoost](http://homes.cs.washington.edu/~tqchen/2016/03/10/story-and-lessons-behind-the-evolution-of-xgboost.html)
|
||||
* [Tutorial: Distributed XGBoost on AWS with YARN](https://xgboost.readthedocs.io/en/latest/tutorials/aws_yarn.html)
|
||||
* [XGBoost brick](NEWS.md) Release
|
||||
|
||||
Ask a Question
|
||||
--------------
|
||||
* For reporting bugs please use the [xgboost/issues](https://github.com/dmlc/xgboost/issues) page.
|
||||
* For generic questions or to share your experience using XGBoost please use the [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/)
|
||||
|
||||
Help to Make XGBoost Better
|
||||
---------------------------
|
||||
XGBoost has been developed and used by a group of active community members. Your help is very valuable to make the package better for everyone.
|
||||
- Check out [call for contributions](https://github.com/dmlc/xgboost/issues?q=is%3Aissue+label%3Acall-for-contribution+is%3Aopen) and [Roadmap](https://github.com/dmlc/xgboost/issues/873) to see what can be improved, or open an issue if you want something.
|
||||
- Contribute to the [documents and examples](https://github.com/dmlc/xgboost/blob/master/doc/) to share your experience with other users.
|
||||
- Add your stories and experience to [Awesome XGBoost](demo/README.md).
|
||||
- Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md) and after your patch has been merged.
|
||||
- Please also update [NEWS.md](NEWS.md) on changes and improvements in API and docs.
|
||||
|
||||
License
|
||||
-------
|
||||
© Contributors, 2016. Licensed under an [Apache-2](https://github.com/dmlc/xgboost/blob/master/LICENSE) license.
|
||||
|
||||
Contribute to XGBoost
|
||||
---------------------
|
||||
XGBoost has been developed and used by a group of active community members. Your help is very valuable to make the package better for everyone.
|
||||
Checkout the [Community Page](https://xgboost.ai/community)
|
||||
|
||||
Reference
|
||||
---------
|
||||
- Tianqi Chen and Carlos Guestrin. [XGBoost: A Scalable Tree Boosting System](http://arxiv.org/abs/1603.02754). In 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016
|
||||
- XGBoost originates from research project at University of Washington.
|
||||
- Tianqi Chen and Carlos Guestrin. [XGBoost: A Scalable Tree Boosting System](http://arxiv.org/abs/1603.02754). In 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016
|
||||
- XGBoost originates from research project at University of Washington, see also the [Project Page at UW](http://dmlc.cs.washington.edu/xgboost.html).
|
||||
|
||||
@@ -20,7 +20,6 @@
|
||||
#include "../src/objective/regression_obj.cc"
|
||||
#include "../src/objective/multiclass_obj.cc"
|
||||
#include "../src/objective/rank_obj.cc"
|
||||
#include "../src/objective/hinge.cc"
|
||||
|
||||
// gbms
|
||||
#include "../src/gbm/gbm.cc"
|
||||
@@ -44,7 +43,6 @@
|
||||
#endif
|
||||
|
||||
// tress
|
||||
#include "../src/tree/split_evaluator.cc"
|
||||
#include "../src/tree/tree_model.cc"
|
||||
#include "../src/tree/tree_updater.cc"
|
||||
#include "../src/tree/updater_colmaker.cc"
|
||||
|
||||
@@ -52,10 +52,8 @@ install:
|
||||
Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "$Env:TEMP\appveyor-tool.ps1"
|
||||
Import-Module "$Env:TEMP\appveyor-tool.ps1"
|
||||
Bootstrap
|
||||
$DEPS = "c('data.table','magrittr','stringi','ggplot2','DiagrammeR','Ckmeans.1d.dp','vcd','testthat','lintr','knitr','rmarkdown')"
|
||||
$DEPS = "c('data.table','magrittr','stringi','ggplot2','DiagrammeR','Ckmeans.1d.dp','vcd','testthat','igraph','knitr','rmarkdown')"
|
||||
cmd.exe /c "R.exe -q -e ""install.packages($DEPS, repos='$CRAN', type='both')"" 2>&1"
|
||||
$BINARY_DEPS = "c('XML','igraph')"
|
||||
cmd.exe /c "R.exe -q -e ""install.packages($BINARY_DEPS, repos='$CRAN', type='win.binary')"" 2>&1"
|
||||
}
|
||||
|
||||
build_script:
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
# Set appropriate compiler and linker flags for sanitizers.
|
||||
#
|
||||
# Usage of this module:
|
||||
# enable_sanitizers("address;leak")
|
||||
|
||||
# Add flags
|
||||
# Enable a single sanitizer.
#
# Arguments:
#   sanitizer - one of "address", "thread", "leak" (matched case-sensitively
#               via MATCHES); any other value is a fatal configure error.
#
# Side effects:
#   Appends the matching -fsanitize=<name> flag to SAN_COMPILE_FLAGS and
#   links every subsequent target against the runtime library located by
#   the corresponding Find<XSan> module (REQUIRED, so a missing runtime
#   aborts configuration).
macro(enable_sanitizer sanitizer)
  if(${sanitizer} MATCHES "address")
    find_package(ASan REQUIRED)
    set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=address")
    link_libraries(${ASan_LIBRARY})

  elseif(${sanitizer} MATCHES "thread")
    find_package(TSan REQUIRED)
    set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=thread")
    link_libraries(${TSan_LIBRARY})

  elseif(${sanitizer} MATCHES "leak")
    find_package(LSan REQUIRED)
    set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=leak")
    link_libraries(${LSan_LIBRARY})

  else()
    # Fixed typo in the user-facing diagnostic ("Santizer" -> "Sanitizer").
    message(FATAL_ERROR "Sanitizer ${sanitizer} not supported.")
  endif()
endmacro()
|
||||
|
||||
# Set appropriate compiler and linker flags for the requested sanitizers.
#
# Usage:
#   enable_sanitizers("address;leak")
#
# The list is validated first: thread sanitizer cannot be combined with any
# other sanitizer, and we want to fail before mutating any global flags.
macro(enable_sanitizers SANITIZERS)
  # Compatibility check.
  # Ideally we would use if(san IN_LIST SANITIZERS), but that requires a
  # newer CMake (3.3+ with CMP0057), so a manual loop is used instead.
  foreach(_san ${SANITIZERS})
    string(TOLOWER ${_san} _san)
    if(_san MATCHES "thread")
      # Bare variable name: if(_use_other_sanitizers) is false when the
      # variable is undefined. The previous if(${_use_other_sanitizers})
      # expanded to if() on the first iteration, which is a CMake error.
      if(_use_other_sanitizers)
        message(FATAL_ERROR
          "thread sanitizer is not compatible with ${_san} sanitizer.")
      endif()
      set(_use_thread_sanitizer 1)
    else()
      if(_use_thread_sanitizer)
        message(FATAL_ERROR
          "${_san} sanitizer is not compatible with thread sanitizer.")
      endif()
      set(_use_other_sanitizers 1)
    endif()
  endforeach()

  message("Sanitizers: ${SANITIZERS}")

  # Apply each sanitizer now that the combination is known to be valid.
  foreach(_san ${SANITIZERS})
    string(TOLOWER ${_san} _san)
    enable_sanitizer(${_san})
  endforeach()
  message("Sanitizers compile flags: ${SAN_COMPILE_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_COMPILE_FLAGS}")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_COMPILE_FLAGS}")
endmacro()
|
||||
@@ -1,13 +0,0 @@
|
||||
# Locate the AddressSanitizer runtime library (libasan).
#
# Defines:
#   ASan_LIBRARY  - full path to the asan shared library
#   ASan_LIB_NAME - canonical library name
set(ASan_LIB_NAME ASan)

find_library(ASan_LIBRARY
  NAMES
    libasan.so
    libasan.so.4
  PATHS
    /usr/lib64
    /usr/lib
    /usr/local/lib64
    /usr/local/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(ASan DEFAULT_MSG ASan_LIBRARY)

mark_as_advanced(ASan_LIBRARY ASan_LIB_NAME)
|
||||
79
cmake/modules/FindGTest.cmake
Normal file
79
cmake/modules/FindGTest.cmake
Normal file
@@ -0,0 +1,79 @@
|
||||
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find GTest headers and libraries.
#
# Usage of this module as follows:
#
#  find_package(GTest)
#
# NOTE(review): unlike typical Find modules, this one searches ONLY under
# ${CMAKE_SOURCE_DIR}/gtest (NO_DEFAULT_PATH). Neither a GTest_HOME variable
# nor the GTEST_HOME environment variable is consulted, despite what the
# previous header comment claimed.
#
# This module defines
#  GTEST_INCLUDE_DIR, directory containing headers
#  GTEST_LIBS, directory containing gtest libraries
#  GTEST_STATIC_LIB, path to libgtest.a
#  GTEST_SHARED_LIB, path to libgtest's shared library
#  GTEST_FOUND, whether gtest has been found

find_path(GTEST_INCLUDE_DIR
  NAMES gtest/gtest.h gtest.h
  PATHS ${CMAKE_SOURCE_DIR}/gtest/include
  NO_DEFAULT_PATH)
find_library(GTEST_LIBRARIES
  NAMES gtest
  PATHS ${CMAKE_SOURCE_DIR}/gtest/lib
  NO_DEFAULT_PATH)

if (GTEST_INCLUDE_DIR)
  message(STATUS "Found the GTest includes: ${GTEST_INCLUDE_DIR}")
endif ()

if (GTEST_INCLUDE_DIR AND GTEST_LIBRARIES)
  set(GTEST_FOUND TRUE)
  # Derive the library directory and the canonical static/shared paths
  # using the platform's library prefix/suffix conventions.
  get_filename_component(GTEST_LIBS ${GTEST_LIBRARIES} PATH)
  set(GTEST_LIB_NAME gtest)
  set(GTEST_STATIC_LIB ${GTEST_LIBS}/${CMAKE_STATIC_LIBRARY_PREFIX}${GTEST_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX})
  set(GTEST_MAIN_STATIC_LIB ${GTEST_LIBS}/${CMAKE_STATIC_LIBRARY_PREFIX}${GTEST_LIB_NAME}_main${CMAKE_STATIC_LIBRARY_SUFFIX})
  set(GTEST_SHARED_LIB ${GTEST_LIBS}/${CMAKE_SHARED_LIBRARY_PREFIX}${GTEST_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
else ()
  set(GTEST_FOUND FALSE)
endif ()

if (GTEST_FOUND)
  if (NOT GTest_FIND_QUIETLY)
    message(STATUS "Found the GTest library: ${GTEST_LIBRARIES}")
  endif ()
else ()
  if (NOT GTest_FIND_QUIETLY)
    # Report the directory that was actually searched. The previous message
    # branched on _gtest_roots, a variable that is never set in this module,
    # so it always (misleadingly) claimed "system search paths".
    set(GTEST_ERR_MSG "Could not find the GTest library. Looked in ${CMAKE_SOURCE_DIR}/gtest.")
    if (GTest_FIND_REQUIRED)
      message(FATAL_ERROR "${GTEST_ERR_MSG}")
    else ()
      message(STATUS "${GTEST_ERR_MSG}")
    endif ()
  endif ()
endif ()

mark_as_advanced(
  GTEST_INCLUDE_DIR
  GTEST_LIBS
  GTEST_LIBRARIES
  GTEST_STATIC_LIB
  GTEST_SHARED_LIB
)
|
||||
@@ -1,13 +0,0 @@
|
||||
# Locate the LeakSanitizer runtime library (liblsan).
#
# Defines:
#   LSan_LIBRARY  - full path to the lsan shared library
#   LSan_LIB_NAME - canonical library name
set(LSan_LIB_NAME lsan)

find_library(LSan_LIBRARY
  NAMES
    liblsan.so
    liblsan.so.0
    liblsan.so.0.0.0
  PATHS
    /usr/lib64
    /usr/lib
    /usr/local/lib64
    /usr/local/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(LSan DEFAULT_MSG LSan_LIBRARY)

mark_as_advanced(LSan_LIBRARY LSan_LIB_NAME)
|
||||
@@ -1,58 +0,0 @@
|
||||
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
#  find_package(NCCL)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
#  NCCL_ROOT - When set, this path is inspected instead of standard library
#              locations as the root of the NCCL installation.
#              The environment variable NCCL_ROOT overrides this variable.
#
# This module defines
#  Nccl_FOUND, whether nccl has been found
#  NCCL_INCLUDE_DIR, directory containing header
#  NCCL_LIBRARY, directory containing nccl library
#  NCCL_LIB_NAME, nccl library name
#
# This module assumes that the user has already called find_package(CUDA)

set(NCCL_LIB_NAME nccl_static)

find_path(NCCL_INCLUDE_DIR
  NAMES nccl.h
  PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include ${CUDA_INCLUDE_DIRS} /usr/include)

find_library(NCCL_LIBRARY
  NAMES ${NCCL_LIB_NAME}
  PATHS $ENV{NCCL_ROOT}/lib ${NCCL_ROOT}/lib ${CUDA_INCLUDE_DIRS}/../lib /usr/lib)

# NOTE(review): NCCL_LIBRARY is overwritten in place with its containing
# directory (matching the header comment "directory containing nccl
# library") — confirm that consumers expect a directory, not a file path.
if (NCCL_INCLUDE_DIR AND NCCL_LIBRARY)
  get_filename_component(NCCL_LIBRARY ${NCCL_LIBRARY} PATH)
endif ()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
  NCCL_INCLUDE_DIR NCCL_LIBRARY)

mark_as_advanced(
  NCCL_INCLUDE_DIR
  NCCL_LIBRARY
  NCCL_LIB_NAME
)
|
||||
@@ -1,13 +0,0 @@
|
||||
# Locate the ThreadSanitizer runtime library (libtsan).
#
# Defines:
#   TSan_LIBRARY  - full path to the tsan shared library
#   TSan_LIB_NAME - canonical library name
set(TSan_LIB_NAME tsan)

find_library(TSan_LIBRARY
  NAMES
    libtsan.so
    libtsan.so.0
    libtsan.so.0.0.0
  PATHS
    /usr/lib64
    /usr/lib
    /usr/local/lib64
    /usr/local/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(TSan DEFAULT_MSG TSan_LIBRARY)

mark_as_advanced(TSan_LIBRARY TSan_LIB_NAME)
|
||||
@@ -80,12 +80,6 @@ booster = gblinear
|
||||
# L2 regularization term on weights, default 0
|
||||
lambda = 0.01
|
||||
# L1 regularization term on weights, default 0
|
||||
If ```agaricus.txt.test.buffer``` exists, and automatically loads from binary buffer if possible, this can speedup training process when you do training many times. You can disable it by setting ```use_buffer=0```.
|
||||
- Buffer file can also be used as standalone input, i.e if buffer file exists, but original agaricus.txt.test was removed, xgboost will still run
|
||||
* Deviation from LibSVM input format: xgboost is compatible with LibSVM format, with the following minor differences:
|
||||
- xgboost allows feature index starts from 0
|
||||
- for binary classification, the label is 1 for positive, 0 for negative, instead of +1,-1
|
||||
- the feature indices in each line *do not* need to be sorted
|
||||
alpha = 0.01
|
||||
# L2 regularization term on bias, default 0
|
||||
lambda_bias = 0.01
|
||||
@@ -102,7 +96,7 @@ After training, we can use the output model to get the prediction of the test da
|
||||
For binary classification, the output predictions are probability confidence scores in [0,1], corresponds to the probability of the label to be positive.
|
||||
|
||||
#### Dump Model
|
||||
This is a preliminary feature, so far only tree model support text dump. XGBoost can display the tree models in text files and we can scan the model in an easy way:
|
||||
This is a preliminary feature, so only tree models support text dump. XGBoost can display the tree models in text or JSON files, and we can scan the model in an easy way:
|
||||
```
|
||||
../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
||||
../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
|
||||
|
||||
@@ -33,10 +33,10 @@ def logregobj(preds, dtrain):
|
||||
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
|
||||
def evalerror(preds, dtrain):
|
||||
labels = dtrain.get_label()
|
||||
# return a pair metric_name, result
|
||||
# return a pair metric_name, result. The metric name must not contain a colon (:)
|
||||
# since preds are margin(before logistic transformation, cutoff at 0)
|
||||
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
|
||||
|
||||
# training with customized objective, we can also do step by step training
|
||||
# simply look at xgboost.py's implementation of train
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror)
|
||||
|
||||
@@ -14,15 +14,8 @@ For more usage details please refer to the [binary classification demo](../binar
|
||||
|
||||
Instructions
|
||||
====
|
||||
The dataset for ranking demo is from LETOR04 MQ2008 fold1.
|
||||
You can use the following command to run the example:
|
||||
The dataset for ranking demo is from LETOR04 MQ2008 fold1,
|
||||
You can use the following command to run the example
|
||||
|
||||
Get the data:
|
||||
```
|
||||
./wgetdata.sh
|
||||
```
|
||||
|
||||
Run the example:
|
||||
```
|
||||
./runexp.sh
|
||||
```
|
||||
Get the data: ./wgetdata.sh
|
||||
Run the example: ./runexp.sh
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
wget https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.rar
|
||||
wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar
|
||||
unrar x MQ2008.rar
|
||||
mv -f MQ2008/Fold1/*.txt .
|
||||
|
||||
Submodule dmlc-core updated: f2afdc7788...dadcd97fdc
377
doc/build.md
Normal file
377
doc/build.md
Normal file
@@ -0,0 +1,377 @@
|
||||
Installation Guide
|
||||
==================
|
||||
|
||||
This page gives instructions on how to build and install the xgboost package from
|
||||
scratch on various systems. It consists of two steps:
|
||||
|
||||
1. First build the shared library from the C++ codes (`libxgboost.so` for Linux/OSX and `xgboost.dll` for Windows).
|
||||
- Exception: for R-package installation please directly refer to the R package section.
|
||||
2. Then install the language packages (e.g. Python Package).
|
||||
|
||||
***Important*** the newest version of xgboost uses submodule to maintain packages. So when you clone the repo, remember to use the recursive option as follows.
|
||||
```bash
|
||||
git clone --recursive https://github.com/dmlc/xgboost
|
||||
```
|
||||
For windows users who use github tools, you can open the git shell, and type the following command.
|
||||
```bash
|
||||
git submodule init
|
||||
git submodule update
|
||||
```
|
||||
|
||||
Please refer to [Trouble Shooting Section](#trouble-shooting) first if you had any problem
|
||||
during installation. If the instructions do not work for you, please feel free
|
||||
to ask questions at [xgboost/issues](https://github.com/dmlc/xgboost/issues), or
|
||||
even better to send pull request if you can fix the problem.
|
||||
|
||||
## Contents
|
||||
- [Build the Shared Library](#build-the-shared-library)
|
||||
- [Building on Ubuntu/Debian](#building-on-ubuntu-debian)
|
||||
- [Building on macOS](#building-on-macos)
|
||||
- [Building on Windows](#building-on-windows)
|
||||
- [Building with GPU support](#building-with-gpu-support)
|
||||
- [Windows Binaries](#windows-binaries)
|
||||
- [Customized Building](#customized-building)
|
||||
- [Python Package Installation](#python-package-installation)
|
||||
- [R Package Installation](#r-package-installation)
|
||||
- [Trouble Shooting](#trouble-shooting)
|
||||
|
||||
## Build the Shared Library
|
||||
|
||||
Our goal is to build the shared library:
|
||||
- On Linux/OSX the target library is `libxgboost.so`
|
||||
- On Windows the target library is `xgboost.dll`
|
||||
|
||||
The minimal building requirement is
|
||||
|
||||
- A recent c++ compiler supporting C++ 11 (g++-4.8 or higher)
|
||||
|
||||
We can edit `make/config.mk` to change the compile options, and then build by
|
||||
`make`. If everything goes well, we can go to the specific language installation section.
|
||||
|
||||
### Building on Ubuntu/Debian
|
||||
|
||||
On Ubuntu, one builds xgboost by
|
||||
|
||||
```bash
|
||||
git clone --recursive https://github.com/dmlc/xgboost
|
||||
cd xgboost; make -j4
|
||||
```
|
||||
|
||||
### Building on macOS
|
||||
|
||||
**Install with pip - simple method**
|
||||
|
||||
First, make sure you obtained *gcc-5* (newer version does not work with this method yet). Note: installation of `gcc` can take a while (~ 30 minutes)
|
||||
|
||||
```bash
|
||||
brew install gcc5
|
||||
```
|
||||
|
||||
You might need to run the following command with `sudo` if you run into some permission errors:
|
||||
|
||||
```bash
|
||||
pip install xgboost
|
||||
```
|
||||
|
||||
**Build from the source code - advanced method**
|
||||
|
||||
First, obtain gcc-7.x.x with brew (https://brew.sh/) if you want multi-threaded version, otherwise, Clang is ok if OpenMP / multi-threaded is not required. Note: installation of `gcc` can take a while (~ 30 minutes)
|
||||
|
||||
```bash
|
||||
brew install gcc
|
||||
```
|
||||
|
||||
Now, clone the repository
|
||||
|
||||
```bash
|
||||
git clone --recursive https://github.com/dmlc/xgboost
|
||||
cd xgboost; cp make/config.mk ./config.mk
|
||||
```
|
||||
|
||||
Open config.mk and uncomment these two lines
|
||||
|
||||
```config.mk
|
||||
export CC = gcc
|
||||
export CXX = g++
|
||||
```
|
||||
|
||||
and replace these two lines into(5 or 6 or 7; depending on your gcc-version)
|
||||
|
||||
```config.mk
|
||||
export CC = gcc-7
|
||||
export CXX = g++-7
|
||||
```
|
||||
|
||||
To find your gcc version
|
||||
|
||||
```bash
|
||||
gcc --version
|
||||
```
|
||||
|
||||
and build using the following commands
|
||||
|
||||
```bash
|
||||
make -j4
|
||||
```
|
||||
head over to `Python Package Installation` for the next steps
|
||||
|
||||
### Building on Windows
|
||||
You need to first clone the xgboost repo with recursive option clone the submodules.
|
||||
If you are using github tools, you can open the git-shell, and type the following command.
|
||||
We recommend using [Git for Windows](https://git-for-windows.github.io/)
|
||||
because it brings a standard bash shell. This will highly ease the installation process.
|
||||
|
||||
```bash
|
||||
git submodule init
|
||||
git submodule update
|
||||
```
|
||||
|
||||
XGBoost supports building with both MSVC and MinGW. Here is how you can build the xgboost library using MinGW.
|
||||
|
||||
After installing [Git for Windows](https://git-for-windows.github.io/), you should have a shortcut `Git Bash`.
|
||||
All the following steps are in the `Git Bash`.
|
||||
|
||||
In MinGW, `make` command comes with the name `mingw32-make`. You can add the following line into the `.bashrc` file.
|
||||
```bash
|
||||
alias make='mingw32-make'
|
||||
```
|
||||
(On 64-bit Windows, you should get [mingw64](https://sourceforge.net/projects/mingw-w64/) instead.) Make sure
|
||||
that the path to MinGW is in the system PATH.
|
||||
|
||||
To build with MinGW, type:
|
||||
|
||||
```bash
|
||||
cp make/mingw64.mk config.mk; make -j4
|
||||
```
|
||||
|
||||
To build with Visual Studio 2013 use cmake. Make sure you have a recent version of cmake added to your path and then from the xgboost directory:
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -G"Visual Studio 12 2013 Win64"
|
||||
```
|
||||
|
||||
This specifies an out of source build using the MSVC 12 64 bit generator. Open the .sln file in the build directory and build with Visual Studio. To use the Python module you can copy `xgboost.dll` into python-package\xgboost.
|
||||
|
||||
Other versions of Visual Studio may work but are untested.
|
||||
|
||||
### Building with GPU support
|
||||
|
||||
XGBoost can be built with GPU support for both Linux and Windows using cmake. GPU support works with the Python package as well as the CLI version. See [Installing R package with GPU support](#installing-r-package-with-gpu-support) for special instructions for R.
|
||||
|
||||
An up-to-date version of the CUDA toolkit is required.
|
||||
|
||||
From the command line on Linux starting from the xgboost directory:
|
||||
|
||||
```bash
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ cmake .. -DUSE_CUDA=ON
|
||||
$ make -j
|
||||
```
|
||||
**Windows requirements** for GPU build: only Visual C++ 2015 or 2013 with CUDA v8.0 were fully tested. Either install Visual C++ 2015 Build Tools separately, or as a part of Visual Studio 2015. If you already have Visual Studio 2017, the Visual C++ 2015 Toolchain component has to be installed using the VS 2017 Installer. Likely, you would need to use the VS2015 x64 Native Tools command prompt to run the cmake commands given below. In some situations, however, things run just fine from MSYS2 bash command line.
|
||||
|
||||
On Windows, using cmake, see what options for Generators you have for cmake, and choose one with [arch] replaced by Win64:
|
||||
```bash
|
||||
cmake -help
|
||||
```
|
||||
Then run cmake as:
|
||||
```bash
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ cmake .. -G"Visual Studio 14 2015 Win64" -DUSE_CUDA=ON
|
||||
```
|
||||
To speed up compilation, compute version specific to your GPU could be passed to cmake as, e.g., `-DGPU_COMPUTE_VER=50`.
|
||||
The above cmake configuration run will create an xgboost.sln solution file in the build directory. Build this solution in release mode as a x64 build, either from Visual studio or from command line:
|
||||
```
|
||||
cmake --build . --target xgboost --config Release
|
||||
```
|
||||
If build seems to use only a single process, you might try to append an option like ` -- /m:6` to the above command.
|
||||
|
||||
### Windows Binaries
|
||||
|
||||
After the build process successfully ends, you will find a `xgboost.dll` library file inside `./lib/` folder. Copy this file to the API package folder, e.g. `python-package/xgboost` if you are using the *python* API, and you are good to follow the instructions below.
|
||||
|
||||
Unofficial windows binaries and instructions on how to use them are hosted on [Guido Tapia's blog](http://www.picnet.com.au/blogs/guido/post/2016/09/22/xgboost-windows-x64-binaries-for-download/)
|
||||
|
||||
### Customized Building
|
||||
|
||||
The configuration of xgboost can be modified by ```config.mk```
|
||||
- modify configuration on various distributed filesystem such as HDFS/Amazon S3/...
|
||||
- First copy [make/config.mk](../make/config.mk) to the project root, on which
|
||||
any local modification will be ignored by git, then modify the according flags.
|
||||
|
||||
|
||||
|
||||
## Python Package Installation
|
||||
|
||||
The python package is located at [python-package](../python-package).
|
||||
There are several ways to install the package:
|
||||
|
||||
1. Install system-widely, which requires root permission
|
||||
|
||||
```bash
|
||||
cd python-package; sudo python setup.py install
|
||||
```
|
||||
|
||||
You will however need Python `distutils` module for this to
|
||||
work. It is often part of the core python package or it can be installed using your
|
||||
package manager, e.g. in Debian use
|
||||
|
||||
```bash
|
||||
sudo apt-get install python-setuptools
|
||||
```
|
||||
|
||||
*NOTE: If you recompiled xgboost, then you need to reinstall it again to
|
||||
make the new library take effect*
|
||||
|
||||
2. Only set the environment variable `PYTHONPATH` to tell python where to find
|
||||
the library. For example, assume we cloned `xgboost` on the home directory
|
||||
`~`. Then we can add the following line in `~/.bashrc`.
|
||||
It is ***recommended for developers*** who may change the codes. The changes will be immediately reflected once you pulled the code and rebuild the project (no need to call ```setup``` again)
|
||||
|
||||
```bash
|
||||
export PYTHONPATH=~/xgboost/python-package
|
||||
```
|
||||
|
||||
3. Install only for the current user.
|
||||
|
||||
```bash
|
||||
cd python-package; python setup.py develop --user
|
||||
```
|
||||
|
||||
4. If you are installing the latest xgboost version which requires compilation, add MinGW to the system PATH:
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ['PATH'] = os.environ['PATH'] + ';C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
|
||||
```
|
||||
|
||||
## R Package Installation
|
||||
|
||||
### Installing pre-packaged version
|
||||
|
||||
You can install xgboost from CRAN just like any other R package:
|
||||
|
||||
```r
|
||||
install.packages("xgboost")
|
||||
```
|
||||
|
||||
Or you can install it from our weekly updated drat repo:
|
||||
|
||||
```r
|
||||
install.packages("drat", repos="https://cran.rstudio.com")
|
||||
drat:::addRepo("dmlc")
|
||||
install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source")
|
||||
```
|
||||
|
||||
For OSX users, single threaded version will be installed. To install multi-threaded version,
|
||||
first follow [Building on OSX](#building-on-osx) to get the OpenMP enabled compiler, then:
|
||||
|
||||
- Set the `Makevars` file in highest priority for R.
|
||||
|
||||
The point is, there are three `Makevars` files: `~/.R/Makevars`, `xgboost/R-package/src/Makevars`, and `/usr/local/Cellar/r/3.2.0/R.framework/Resources/etc/Makeconf` (the last one obtained by running `file.path(R.home("etc"), "Makeconf")` in R), and `SHLIB_OPENMP_CXXFLAGS` is not set by default! After experimenting, it appears that the first one has the highest priority (surprisingly).
|
||||
|
||||
Then inside R, run
|
||||
|
||||
```R
|
||||
install.packages("drat", repos="https://cran.rstudio.com")
|
||||
drat:::addRepo("dmlc")
|
||||
install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source")
|
||||
```
|
||||
|
||||
### Installing the development version
|
||||
|
||||
Make sure you have installed git and a recent C++ compiler supporting C++11 (e.g., g++-4.8 or higher).
|
||||
On Windows, Rtools must be installed, and its bin directory has to be added to PATH during the installation.
|
||||
And see the previous subsection for an OSX tip.
|
||||
|
||||
Due to the use of git-submodules, `devtools::install_github` can no longer be used to install the latest version of R package.
|
||||
Thus, one has to run git to check out the code first:
|
||||
|
||||
```bash
|
||||
git clone --recursive https://github.com/dmlc/xgboost
|
||||
cd xgboost
|
||||
git submodule init
|
||||
git submodule update
|
||||
cd R-package
|
||||
R CMD INSTALL .
|
||||
```
|
||||
|
||||
If the last line fails because of "R: command not found", it means that R was not set up to run from command line.
|
||||
In this case, just start R as you would normally do and run the following:
|
||||
|
||||
```r
|
||||
setwd('wherever/you/cloned/it/xgboost/R-package/')
|
||||
install.packages('.', repos = NULL, type="source")
|
||||
```
|
||||
|
||||
The package could also be built and installed with cmake (and Visual C++ 2015 on Windows) using instructions from the next section, but without GPU support (omit the `-DUSE_CUDA=ON` cmake parameter).
|
||||
|
||||
If all fails, try [building the shared library](#build-the-shared-library) to see whether a problem is specific to R package or not.
|
||||
|
||||
### Installing R package with GPU support
|
||||
|
||||
The procedure and requirements are similar as in [Building with GPU support](#building-with-gpu-support), so make sure to read it first.
|
||||
|
||||
On Linux, starting from the xgboost directory:
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DUSE_CUDA=ON -DR_LIB=ON
|
||||
make install -j
|
||||
```
|
||||
When default target is used, an R package shared library would be built in the `build` area.
|
||||
The `install` target, in addition, assembles the package files with this shared library under `build/R-package`, and runs `R CMD INSTALL`.
|
||||
|
||||
On Windows, cmake with Visual C++ Build Tools (or Visual Studio) has to be used to build an R package with GPU support. Rtools must also be installed (perhaps, some other MinGW distributions with `gendef.exe` and `dlltool.exe` would work, but that was not tested).
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -G"Visual Studio 14 2015 Win64" -DUSE_CUDA=ON -DR_LIB=ON
|
||||
cmake --build . --target install --config Release
|
||||
```
|
||||
When `--target xgboost` is used, an R package dll would be built under `build/Release`.
|
||||
The `--target install`, in addition, assembles the package files with this dll under `build/R-package`, and runs `R CMD INSTALL`.
|
||||
|
||||
If cmake can't find your R during the configuration step, you might provide the location of its executable to cmake like this: `-DLIBR_EXECUTABLE="C:/Program Files/R/R-3.4.1/bin/x64/R.exe"`.
|
||||
|
||||
If on Windows you get a "permission denied" error when trying to write to ...Program Files/R/... during the package installation, create a `.Rprofile` file in your personal home directory (if you don't already have one in there), and add a line to it which specifies the location of your R packages user library, like the following:
|
||||
```r
|
||||
.libPaths( unique(c("C:/Users/USERNAME/Documents/R/win-library/3.4", .libPaths())))
|
||||
```
|
||||
You might find the exact location by running `.libPaths()` in R GUI or RStudio.
|
||||
|
||||
## Trouble Shooting
|
||||
|
||||
1. **Compile failed after `git pull`**
|
||||
|
||||
Please first update the submodules, clean all and recompile:
|
||||
|
||||
```bash
|
||||
git submodule update && make clean_all && make -j4
|
||||
```
|
||||
|
||||
2. **Compile failed after `config.mk` is modified**
|
||||
|
||||
Need to clean all first:
|
||||
|
||||
```bash
|
||||
make clean_all && make -j4
|
||||
```
|
||||
|
||||
|
||||
3. **Makefile: dmlc-core/make/dmlc.mk: No such file or directory**
|
||||
|
||||
We need to recursively clone the submodule, you can do:
|
||||
|
||||
```bash
|
||||
git submodule init
|
||||
git submodule update
|
||||
```
|
||||
Alternatively, do another clone
|
||||
```bash
|
||||
git clone https://github.com/dmlc/xgboost --recursive
|
||||
```
|
||||
@@ -4,17 +4,15 @@ Installation Guide
|
||||
|
||||
.. note:: Pre-built binary wheel for Python
|
||||
|
||||
If you are planning to use Python, consider installing XGBoost from a pre-built binary wheel, available from Python Package Index (PyPI). You may download and install it by running
|
||||
If you are planning to use Python on a Linux system, consider installing XGBoost from a pre-built binary wheel. The wheel is available from Python Package Index (PyPI). You may download and install it by running
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Ensure that you are downloading one of the following:
|
||||
# * xgboost-{version}-py2.py3-none-manylinux1_x86_64.whl
|
||||
# * xgboost-{version}-py2.py3-none-win_amd64.whl
|
||||
# Ensure that you are downloading xgboost-{version}-py2.py3-none-manylinux1_x86_64.whl
|
||||
pip3 install xgboost
|
||||
|
||||
* The binary wheel will support GPU algorithms (`gpu_exact`, `gpu_hist`) on machines with NVIDIA GPUs. **However, it will not support multi-GPU training; only single GPU will be used.** To enable multi-GPU training, download and install the binary wheel from `this page <https://s3-us-west-2.amazonaws.com/xgboost-wheels/list.html>`_.
|
||||
* Currently, we provide binary wheels for 64-bit Linux and Windows.
|
||||
* This package will support GPU algorithms (`gpu_exact`, `gpu_hist`) on machines with NVIDIA GPUs.
|
||||
* Currently, PyPI has a binary wheel only for 64-bit Linux.
|
||||
|
||||
****************************
|
||||
Building XGBoost from source
|
||||
@@ -189,15 +187,13 @@ After the build process successfully ends, you will find a ``xgboost.dll`` libra
|
||||
|
||||
Unofficial windows binaries and instructions on how to use them are hosted on `Guido Tapia's blog <http://www.picnet.com.au/blogs/guido/post/2016/09/22/xgboost-windows-x64-binaries-for-download/>`_.
|
||||
|
||||
.. _build_gpu_support:
|
||||
|
||||
Building with GPU support
|
||||
=========================
|
||||
XGBoost can be built with GPU support for both Linux and Windows using CMake. GPU support works with the Python package as well as the CLI version. See `Installing R package with GPU support`_ for special instructions for R.
|
||||
|
||||
An up-to-date version of the CUDA toolkit is required.
|
||||
|
||||
From the command line on Linux starting from the XGBoost directory:
|
||||
From the command line on Linux starting from the xgboost directory:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -206,16 +202,9 @@ From the command line on Linux starting from the XGBoost directory:
|
||||
cmake .. -DUSE_CUDA=ON
|
||||
make -j
|
||||
|
||||
.. note:: Enabling multi-GPU training
|
||||
.. note:: Windows requirements for GPU build
|
||||
|
||||
By default, multi-GPU training is disabled and only a single GPU will be used. To enable multi-GPU training, set the option ``USE_NCCL=ON``. Multi-GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **multi-GPU training is available only for Linux**.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON
|
||||
make -j
|
||||
Only Visual C++ 2015 or 2013 with CUDA v8.0 were fully tested. Either install Visual C++ 2015 Build Tools separately, or as a part of Visual Studio 2015. If you already have Visual Studio 2017, the Visual C++ 2015 Toolchain component has to be installed using the VS 2017 Installer. Likely, you would need to use the VS2015 x64 Native Tools command prompt to run the cmake commands given below. In some situations, however, things run just fine from MSYS2 bash command line.
|
||||
|
||||
On Windows, see what options for generators you have for CMake, and choose one with ``[arch]`` replaced with Win64:
|
||||
|
||||
|
||||
10
doc/conf.py
10
doc/conf.py
@@ -14,6 +14,7 @@
|
||||
from subprocess import call
|
||||
from sh.contrib import git
|
||||
import urllib.request
|
||||
from urllib.error import HTTPError
|
||||
from recommonmark.parser import CommonMarkParser
|
||||
import sys
|
||||
import re
|
||||
@@ -24,8 +25,11 @@ import guzzle_sphinx_theme
|
||||
git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')]
|
||||
git_branch = [x for x in git_branch if 'HEAD' not in x]
|
||||
print('git_branch = {}'.format(git_branch[0]))
|
||||
filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
|
||||
call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
|
||||
try:
|
||||
filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
|
||||
call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
|
||||
except HTTPError:
|
||||
print('JVM doc not found. Skipping...')
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
@@ -146,7 +150,7 @@ extensions.append("guzzle_sphinx_theme")
|
||||
# Guzzle theme options (see theme.conf for more information)
|
||||
html_theme_options = {
|
||||
# Set the name of the project to appear in the sidebar
|
||||
"project_nav_name": "XGBoost"
|
||||
"project_nav_name": "XGBoost (0.72)"
|
||||
}
|
||||
|
||||
html_sidebars = {
|
||||
|
||||
@@ -18,7 +18,6 @@ Everyone is more than welcome to contribute. It is a way to make the project bet
|
||||
|
||||
* `Documents`_
|
||||
* `Testcases`_
|
||||
* `Sanitizers`_
|
||||
* `Examples`_
|
||||
* `Core Library`_
|
||||
* `Python Package`_
|
||||
@@ -122,46 +121,6 @@ Testcases
|
||||
* All the testcases are in `tests <https://github.com/dmlc/xgboost/tree/master/tests>`_.
|
||||
* We use python nose for python test cases.
|
||||
|
||||
**********
|
||||
Sanitizers
|
||||
**********
|
||||
|
||||
By default, sanitizers are bundled in GCC and Clang/LLVM. One can enable
|
||||
sanitizers with GCC >= 4.8 or LLVM >= 3.1, But some distributions might package
|
||||
sanitizers separately. Here is a list of supported sanitizers with
|
||||
corresponding library names:
|
||||
|
||||
- Address sanitizer: libasan
|
||||
- Leak sanitizer: liblsan
|
||||
- Thread sanitizer: libtsan
|
||||
|
||||
Memory sanitizer is exclusive to LLVM, hence not supported in XGBoost.
|
||||
|
||||
How to build XGBoost with sanitizers
|
||||
====================================
|
||||
One can build XGBoost with sanitizer support by specifying -DUSE_SANITIZER=ON.
|
||||
By default, address sanitizer and leak sanitizer are used when you turn the
|
||||
USE_SANITIZER flag on. You can always change the default by providing a
|
||||
semicolon separated list of sanitizers to ENABLED_SANITIZERS. Note that thread
|
||||
sanitizer is not compatible with the other two sanitizers.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;leak" /path/to/xgboost
|
||||
|
||||
How to use sanitizers with CUDA support
|
||||
=======================================
|
||||
Running XGBoost on CUDA with the address sanitizer (asan) will raise a memory error.
|
||||
To use asan with CUDA correctly, you need to configure asan via ASAN_OPTIONS
|
||||
environment variable:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ASAN_OPTIONS=protect_shadow_gap=0 ../testxgboost
|
||||
|
||||
For details, please consult `official documentation <https://github.com/google/sanitizers/wiki>`_ for sanitizers.
|
||||
|
||||
|
||||
********
|
||||
Examples
|
||||
********
|
||||
|
||||
79
doc/get_started/index.md
Normal file
79
doc/get_started/index.md
Normal file
@@ -0,0 +1,79 @@
|
||||
# Get Started with XGBoost
|
||||
|
||||
This is a quick start tutorial showing snippets for you to quickly try out xgboost
|
||||
on the demo dataset on a binary classification task.
|
||||
|
||||
## Links to Helpful Other Resources
|
||||
- See [Installation Guide](../build.md) on how to install xgboost.
|
||||
- See [How to pages](../how_to/index.md) on various tips on using xgboost.
|
||||
- See [Tutorials](../tutorials/index.md) on tutorials on specific tasks.
|
||||
- See [Learning to use XGBoost by Examples](../../demo) for more code examples.
|
||||
|
||||
## Python
|
||||
```python
|
||||
import xgboost as xgb
|
||||
# read in data
|
||||
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
|
||||
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')
|
||||
# specify parameters via map
|
||||
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
|
||||
num_round = 2
|
||||
bst = xgb.train(param, dtrain, num_round)
|
||||
# make prediction
|
||||
preds = bst.predict(dtest)
|
||||
```
|
||||
|
||||
## R
|
||||
|
||||
```r
|
||||
# load data
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
# fit model
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
|
||||
nthread = 2, objective = "binary:logistic")
|
||||
# predict
|
||||
pred <- predict(bst, test$data)
|
||||
```
|
||||
|
||||
## Julia
|
||||
```julia
|
||||
using XGBoost
|
||||
# read data
|
||||
train_X, train_Y = readlibsvm("demo/data/agaricus.txt.train", (6513, 126))
|
||||
test_X, test_Y = readlibsvm("demo/data/agaricus.txt.test", (1611, 126))
|
||||
# fit model
|
||||
num_round = 2
|
||||
bst = xgboost(train_X, num_round, label=train_Y, eta=1, max_depth=2)
|
||||
# predict
|
||||
pred = predict(bst, test_X)
|
||||
```
|
||||
|
||||
## Scala
|
||||
```scala
|
||||
import ml.dmlc.xgboost4j.scala.DMatrix
|
||||
import ml.dmlc.xgboost4j.scala.XGBoost
|
||||
|
||||
object XGBoostScalaExample {
|
||||
def main(args: Array[String]) {
|
||||
    // read training data, available at xgboost/demo/data
|
||||
val trainData =
|
||||
new DMatrix("/path/to/agaricus.txt.train")
|
||||
// define parameters
|
||||
val paramMap = List(
|
||||
"eta" -> 0.1,
|
||||
"max_depth" -> 2,
|
||||
"objective" -> "binary:logistic").toMap
|
||||
// number of iterations
|
||||
val round = 2
|
||||
// train the model
|
||||
val model = XGBoost.train(trainData, paramMap, round)
|
||||
// run prediction
|
||||
val predTrain = model.predict(trainData)
|
||||
// save model to the file.
|
||||
model.saveModel("/local/path/to/model")
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -5,10 +5,16 @@ XGBoost GPU Support
|
||||
This page contains information about GPU algorithms supported in XGBoost.
|
||||
To install GPU support, checkout the :doc:`/build`.
|
||||
|
||||
.. note:: CUDA 8.0, Compute Capability 3.5 required
|
||||
|
||||
The GPU algorithms in XGBoost require a graphics card with compute capability 3.5 or higher, with
|
||||
CUDA toolkits 8.0 or later.
|
||||
(See `this list <https://en.wikipedia.org/wiki/CUDA#GPUs_supported>`_ to look up compute capability of your GPU card.)
|
||||
|
||||
*********************************************
|
||||
CUDA Accelerated Tree Construction Algorithms
|
||||
*********************************************
|
||||
Tree construction (training) and prediction can be accelerated with CUDA-capable GPUs.
|
||||
This plugin adds GPU accelerated tree construction and prediction algorithms to XGBoost.
|
||||
|
||||
Usage
|
||||
=====
|
||||
@@ -59,11 +65,7 @@ The device ordinal can be selected using the ``gpu_id`` parameter, which default
|
||||
|
||||
Multiple GPUs can be used with the ``gpu_hist`` tree method using the ``n_gpus`` parameter. which defaults to 1. If this is set to -1 all available GPUs will be used. If ``gpu_id`` is specified as non-zero, the gpu device order is ``mod(gpu_id + i) % n_visible_devices`` for ``i=0`` to ``n_gpus-1``. As with GPU vs. CPU, multi-GPU will not always be faster than a single GPU due to PCI bus bandwidth that can limit performance.
|
||||
|
||||
.. note:: Enabling multi-GPU training
|
||||
|
||||
Default installation may not enable multi-GPU training. To use multiple GPUs, make sure to read :ref:`build_gpu_support`.
|
||||
|
||||
The GPU algorithms currently work with CLI, Python and R packages. See :doc:`/build` for details.
|
||||
This plugin currently works with the CLI, python and R - see :doc:`/build` for details.
|
||||
|
||||
.. code-block:: python
|
||||
:caption: Python example
|
||||
|
||||
17
doc/how_to/index.md
Normal file
17
doc/how_to/index.md
Normal file
@@ -0,0 +1,17 @@
|
||||
# XGBoost How To
|
||||
|
||||
This page contains guidelines to use and develop XGBoost.
|
||||
|
||||
## Installation
|
||||
- [How to Install XGBoost](../build.md)
|
||||
|
||||
## Use XGBoost in Specific Ways
|
||||
- [Parameter tuning guide](param_tuning.md)
|
||||
- [Use out of core computation for large dataset](external_memory.md)
|
||||
- [Use XGBoost GPU algorithms](../gpu/index.md)
|
||||
|
||||
## Develop and Hack XGBoost
|
||||
- [Contribute to XGBoost](contribute.md)
|
||||
|
||||
## Frequently Ask Questions
|
||||
- [FAQ](../faq.md)
|
||||
56
doc/input_format.md
Normal file
56
doc/input_format.md
Normal file
@@ -0,0 +1,56 @@
|
||||
Text Input Format of DMatrix
|
||||
============================
|
||||
|
||||
## Basic Input Format
|
||||
As we have mentioned, XGBoost takes LibSVM format. For training or predicting, XGBoost takes an instance file with the format as below:
|
||||
|
||||
train.txt
|
||||
```
|
||||
1 101:1.2 102:0.03
|
||||
0 1:2.1 10001:300 10002:400
|
||||
0 0:1.3 1:0.3
|
||||
1 0:0.01 1:0.3
|
||||
0 0:0.2 1:0.3
|
||||
```
|
||||
Each line represents a single instance. In the first line, '1' is the instance label, '101' and '102' are feature indices, and '1.2' and '0.03' are feature values. In the binary classification case, '1' is used to indicate positive samples, and '0' is used to indicate negative samples. We also support probability values in [0,1] as label, to indicate the probability of the instance being positive.
|
||||
|
||||
Additional Information
|
||||
----------------------
|
||||
Note: this additional information is only applicable to the single machine version of the package.
|
||||
|
||||
### Group Input Format
|
||||
As XGBoost supports [ranking tasks](../demo/rank), we support the group input format. In a ranking task, instances are categorized into different groups in real-world scenarios; for example, in the learning-to-rank web pages scenario, web page instances are grouped by their queries. In addition to the instance file mentioned above, XGBoost needs a file indicating the group information. For example, if the instance file is the "train.txt" shown above,
|
||||
and the group file is as below:
|
||||
|
||||
train.txt.group
|
||||
```
|
||||
2
|
||||
3
|
||||
```
|
||||
This means that the data set contains 5 instances: the first two instances are in one group, and the other three are in another group. The numbers in the group file indicate the number of instances in each group of the instance file, in order.
|
||||
During configuration, you do not have to indicate the path of the group file. If the instance file name is "xxx", XGBoost will check whether there is a file named "xxx.group" in the same directory and decide whether to read the data as group input format.
|
||||
|
||||
### Instance Weight File
|
||||
XGBoost supports providing each instance a weight to differentiate the importance of instances. For example, if we provide an instance weight file for the "train.txt" file in the example as below:
|
||||
|
||||
train.txt.weight
|
||||
```
|
||||
1
|
||||
0.5
|
||||
0.5
|
||||
1
|
||||
0.5
|
||||
```
|
||||
This means that XGBoost will put more emphasis on the first and fourth instances (that is, the positive instances) while training.
|
||||
The configuration is similar to configuring the group information. If the instance file name is "xxx", XGBoost will check whether there is a file named "xxx.weight" in the same directory and if there is, will use the weights while training models. Weights will be included into an "xxx.buffer" file that is created by XGBoost automatically. If you want to update the weights, you need to delete the "xxx.buffer" file prior to launching XGBoost.
|
||||
|
||||
### Initial Margin file
|
||||
XGBoost supports providing each instance an initial margin prediction. For example, if we have an initial prediction using logistic regression for the "train.txt" file, we can create the following file:
|
||||
|
||||
train.txt.base_margin
|
||||
```
|
||||
-0.4
|
||||
1.0
|
||||
3.4
|
||||
```
|
||||
XGBoost will take these values as initial margin prediction and boost from that. An important note about base_margin is that it should be margin prediction before transformation, so if you are doing logistic loss, you will need to put in value before logistic transformation. If you are using XGBoost predictor, use pred_margin=1 to output margin values.
|
||||
@@ -58,9 +58,10 @@ For sbt, please add the repository and dependency in build.sbt as following:
|
||||
|
||||
If you want to use XGBoost4J-Spark, replace ``xgboost4j`` with ``xgboost4j-spark``.
|
||||
|
||||
.. note:: XGBoost4J-Spark requires Spark 2.3+
|
||||
.. note:: Spark 2.0 Required
|
||||
|
||||
After integrating with Dataframe/Dataset APIs of Spark 2.0, XGBoost4J-Spark only supports compile with Spark 2.x. You can build XGBoost4J-Spark as a component of XGBoost4J by running ``mvn package``, and you can specify the version of spark with ``mvn -Dspark.version=2.0.0 package``. (To continue working with Spark 1.x, the users are supposed to update pom.xml by modifying the properties like ``spark.version``, ``scala.version``, and ``scala.binary.version``. Users also need to change the implementation by replacing ``SparkSession`` with ``SQLContext`` and the type of API parameters from ``Dataset[_]`` to ``Dataframe``)
|
||||
|
||||
XGBoost4J-Spark now requires Spark 2.3+. Latest versions of XGBoost4J-Spark uses facilities of `org.apache.spark.ml.param.shared` extensively to provide for a tight integration with Spark MLLIB framework, and these facilities are not fully available on earlier versions of Spark.
|
||||
|
||||
Installation from maven repo
|
||||
============================
|
||||
@@ -147,7 +148,6 @@ Contents
|
||||
:maxdepth: 2
|
||||
|
||||
java_intro
|
||||
XGBoost4J-Spark Tutorial <xgboost4j_spark_tutorial>
|
||||
Code Examples <https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example>
|
||||
XGBoost4J Java API <javadocs/index>
|
||||
XGBoost4J Scala API <scaladocs/xgboost4j/index>
|
||||
|
||||
@@ -1,513 +0,0 @@
|
||||
#######################################
|
||||
XGBoost4J-Spark Tutorial (version 0.8+)
|
||||
#######################################
|
||||
|
||||
**XGBoost4J-Spark** is a project aiming to seamlessly integrate XGBoost and Apache Spark by fitting XGBoost to Apache Spark's MLLIB framework. With the integration, users can not only use the high-performance algorithm implementation of XGBoost, but also leverage the powerful data processing engine of Spark for:
|
||||
|
||||
* Feature Engineering: feature extraction, transformation, dimensionality reduction, and selection, etc.
|
||||
* Pipelines: constructing, evaluating, and tuning ML Pipelines
|
||||
* Persistence: persist and load machine learning models and even whole Pipelines
|
||||
|
||||
This tutorial is to cover the end-to-end process to build a machine learning pipeline with XGBoost4J-Spark. We will discuss
|
||||
|
||||
* Using Spark to preprocess data to fit to XGBoost/XGBoost4J-Spark's data interface
|
||||
* Training a XGBoost model with XGBoost4J-Spark
|
||||
* Serving XGBoost model (prediction) with Spark
|
||||
* Building a Machine Learning Pipeline with XGBoost4J-Spark
|
||||
* Running XGBoost4J-Spark in Production
|
||||
|
||||
.. contents::
|
||||
:backlinks: none
|
||||
:local:
|
||||
|
||||
********************************************
|
||||
Build an ML Application with XGBoost4J-Spark
|
||||
********************************************
|
||||
|
||||
Refer to XGBoost4J-Spark Dependency
|
||||
===================================
|
||||
|
||||
Before we go into the tour of how to use XGBoost4J-Spark, we would bring a brief introduction about how to build a machine learning application with XGBoost4J-Spark. The first thing you need to do is to refer to the dependency in Maven Central.
|
||||
|
||||
You can add the following dependency in your ``pom.xml``.
|
||||
|
||||
.. code-block:: xml
|
||||
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j-spark</artifactId>
|
||||
<version>latest_version_num</version>
|
||||
</dependency>
|
||||
|
||||
For the latest release version number, please check `here <https://github.com/dmlc/xgboost/releases>`_.
|
||||
|
||||
We also publish some functionalities which would be included in the coming release in the form of snapshot version. To access these functionalities, you can add dependency to the snapshot artifacts. We publish snapshot version in github-based repo, so you can add the following repo in ``pom.xml``:
|
||||
|
||||
.. code-block:: xml
|
||||
|
||||
<repository>
|
||||
<id>XGBoost4J-Spark Snapshot Repo</id>
|
||||
<name>XGBoost4J-Spark Snapshot Repo</name>
|
||||
<url>https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/</url>
|
||||
</repository>
|
||||
|
||||
and then refer to the snapshot dependency by adding:
|
||||
|
||||
.. code-block:: xml
|
||||
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j</artifactId>
|
||||
<version>next_version_num-SNAPSHOT</version>
|
||||
</dependency>
|
||||
|
||||
.. note:: XGBoost4J-Spark requires Spark 2.3+
|
||||
|
||||
XGBoost4J-Spark now requires Spark 2.3+. Latest versions of XGBoost4J-Spark uses facilities of `org.apache.spark.ml.param.shared` extensively to provide for a tight integration with Spark MLLIB framework, and these facilities are not fully available on earlier versions of Spark.
|
||||
|
||||
Data Preparation
|
||||
================
|
||||
|
||||
As aforementioned, XGBoost4J-Spark seamlessly integrates Spark and XGBoost. The integration enables
|
||||
users to apply various types of transformation over the training/test datasets with the convenient
|
||||
and powerful data processing framework, Spark.
|
||||
|
||||
In this section, we use `Iris <https://archive.ics.uci.edu/ml/datasets/iris>`_ dataset as an example to
|
||||
showcase how we use Spark to transform raw dataset and make it fit to the data interface of XGBoost.
|
||||
|
||||
Iris dataset is shipped in CSV format. Each instance contains 4 features, "sepal length", "sepal width",
|
||||
"petal length" and "petal width". In addition, it contains the "class" column, which is essentially the label with three possible values: "Iris Setosa", "Iris Versicolour" and "Iris Virginica".
|
||||
|
||||
Read Dataset with Spark's Built-In Reader
|
||||
-----------------------------------------
|
||||
|
||||
The first thing in data transformation is to load the dataset as Spark's structured data abstraction, DataFrame.
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
|
||||
|
||||
val spark = SparkSession.builder().getOrCreate()
|
||||
val schema = new StructType(Array(
|
||||
StructField("sepal length", DoubleType, true),
|
||||
StructField("sepal width", DoubleType, true),
|
||||
StructField("petal length", DoubleType, true),
|
||||
StructField("petal width", DoubleType, true),
|
||||
StructField("class", StringType, true)))
|
||||
val rawInput = spark.read.schema(schema).csv("input_path")
|
||||
|
||||
At the first line, we create an instance of `SparkSession <http://spark.apache.org/docs/latest/sql-programming-guide.html#starting-point-sparksession>`_ which is the entry point of any Spark program working with DataFrame. The ``schema`` variable defines the schema of DataFrame wrapping Iris data. With this explicitly set schema, we can define the columns' name as well as their types; otherwise the column name would be the default ones derived by Spark, such as ``_col0``, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named ``rawInput``.
|
||||
|
||||
Spark also contains many built-in readers for other format. The latest version of Spark supports CSV, JSON, Parquet, and LIBSVM.
|
||||
|
||||
Transform Raw Iris Dataset
|
||||
--------------------------
|
||||
|
||||
To make Iris dataset be recognizable to XGBoost, we need to
|
||||
|
||||
1. Transform String-typed label, i.e. "class", to Double-typed label.
|
||||
2. Assemble the feature columns as a vector to fit to the data interface of Spark ML framework.
|
||||
|
||||
To convert String-typed label to Double, we can use Spark's built-in feature transformer `StringIndexer <https://spark.apache.org/docs/2.3.1/api/scala/index.html#org.apache.spark.ml.feature.StringIndexer>`_.
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
import org.apache.spark.ml.feature.StringIndexer
|
||||
val stringIndexer = new StringIndexer().
|
||||
setInputCol("class").
|
||||
setOutputCol("classIndex").
|
||||
fit(rawInput)
|
||||
val labelTransformed = stringIndexer.transform(rawInput).drop("class")
|
||||
|
||||
With a newly created StringIndexer instance:
|
||||
|
||||
1. we set input column, i.e. the column containing String-typed label
|
||||
2. we set output column, i.e. the column to contain the Double-typed label.
|
||||
3. Then we ``fit`` the StringIndexer with our input DataFrame ``rawInput``, so that Spark internals can get information like the total number of distinct values, etc.
|
||||
|
||||
Now we have a StringIndexer which is ready to be applied to our input DataFrame. To execute the transformation logic of StringIndexer, we ``transform`` the input DataFrame ``rawInput`` and to keep a concise DataFrame,
|
||||
we drop the column "class" and keep only the feature columns and the transformed Double-typed label column (in the last line of the above code snippet).
|
||||
|
||||
The ``fit`` and ``transform`` are two key operations in MLLIB. Basically, ``fit`` produces a "transformer", e.g. StringIndexer, and each transformer applies ``transform`` method on DataFrame to add new column(s) containing transformed features/labels or prediction results, etc. To understand more about ``fit`` and ``transform``, You can find more details in `here <http://spark.apache.org/docs/latest/ml-pipeline.html#pipeline-components>`_.
|
||||
|
||||
Similarly, we can use another transformer, `VectorAssembler <https://spark.apache.org/docs/2.3.1/api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler>`_, to assemble feature columns "sepal length", "sepal width", "petal length" and "petal width" as a vector.
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
import org.apache.spark.ml.feature.VectorAssembler
|
||||
val vectorAssembler = new VectorAssembler().
|
||||
setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")).
|
||||
setOutputCol("features")
|
||||
val xgbInput = vectorAssembler.transform(labelTransformed).select("features", "classIndex")
|
||||
|
||||
Now, we have a DataFrame containing only two columns, "features" which contains vector-represented
|
||||
"sepal length", "sepal width", "petal length" and "petal width" and "classIndex" which has Double-typed
|
||||
labels. A DataFrame like this (containing vector-represented features and numeric labels) can be fed to XGBoost4J-Spark's training engine directly.
|
||||
|
||||
Training
|
||||
========
|
||||
|
||||
XGBoost supports both regression and classification. While we use Iris dataset in this tutorial to show how we use XGBoost/XGBoost4J-Spark to resolve a multi-classes classification problem, the usage in Regression is very similar to classification.
|
||||
|
||||
To train a XGBoost model for classification, we need to claim a XGBoostClassifier first:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
|
||||
val xgbParam = Map("eta" -> 0.1f,
|
||||
"max_depth" -> 2,
|
||||
"objective" -> "multi:softprob",
|
||||
"num_class" -> 3,
|
||||
"num_round" -> 100,
|
||||
"num_workers" -> 2)
|
||||
val xgbClassifier = new XGBoostClassifier(xgbParam).
|
||||
setFeaturesCol("features").
|
||||
setLabelCol("classIndex")
|
||||
|
||||
The available parameters for training a XGBoost model can be found in :doc:`here </parameter>`. In XGBoost4J-Spark, we support not only the default set of parameters but also the camel-case variant of these parameters to keep consistent with Spark's MLLIB parameters.
|
||||
|
||||
Specifically, each parameter in :doc:`this page </parameter>` has its
|
||||
equivalent form in XGBoost4J-Spark with camel case. For example, to set ``max_depth`` for each tree, you can pass parameter just like what we did in the above code snippet (as ``max_depth`` wrapped in a Map), or you can do it through setters in XGBoostClassifer:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val xgbClassifier = new XGBoostClassifier().
|
||||
setFeaturesCol("features").
|
||||
setLabelCol("classIndex")
|
||||
xgbClassifier.setMaxDepth(2)
|
||||
|
||||
After we set XGBoostClassifier parameters and feature/label column, we can build a transformer, XGBoostClassificationModel by fitting XGBoostClassifier with the input DataFrame. This ``fit`` operation is essentially the training process and the generated model can then be used in prediction.
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val xgbClassificationModel = xgbClassifier.fit(xgbInput)
|
||||
|
||||
Prediction
|
||||
==========
|
||||
|
||||
XGBoost4j-Spark supports two ways for model serving: batch prediction and single instance prediction.
|
||||
|
||||
Batch Prediction
|
||||
----------------
|
||||
|
||||
When we get a model, either XGBoostClassificationModel or XGBoostRegressionModel, it takes a DataFrame, reads the column containing feature vectors, predicts for each feature vector, and outputs a new DataFrame with the following columns by default:
|
||||
|
||||
* XGBoostClassificationModel will output margins (``rawPredictionCol``), probabilities(``probabilityCol``) and the eventual prediction labels (``predictionCol``) for each possible label.
|
||||
* XGBoostRegressionModel will output prediction label(``predictionCol``).
|
||||
|
||||
Batch prediction expects the user to pass the testset in the form of a DataFrame. XGBoost4J-Spark starts a XGBoost worker for each partition of DataFrame for parallel prediction and generates prediction results for the whole DataFrame in a batch.
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val xgbClassificationModel = xgbClassifier.fit(xgbInput)
|
||||
val results = xgbClassificationModel.transform(testSet)
|
||||
|
||||
With the above code snippet, we get a result DataFrame, result containing margin, probability for each class and the prediction for each instance
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
+-----------------+----------+--------------------+--------------------+----------+
|
||||
| features|classIndex| rawPrediction| probability|prediction|
|
||||
+-----------------+----------+--------------------+--------------------+----------+
|
||||
|[5.1,3.5,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0|
|
||||
|[4.9,3.0,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0|
|
||||
|[4.7,3.2,1.3,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0|
|
||||
|[4.6,3.1,1.5,0.2]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0|
|
||||
|[5.0,3.6,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0|
|
||||
|[5.4,3.9,1.7,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0|
|
||||
|[4.6,3.4,1.4,0.3]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0|
|
||||
|[5.0,3.4,1.5,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0|
|
||||
|[4.4,2.9,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0|
|
||||
|[4.9,3.1,1.5,0.1]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0|
|
||||
|[5.4,3.7,1.5,0.2]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0|
|
||||
|[4.8,3.4,1.6,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0|
|
||||
|[4.8,3.0,1.4,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0|
|
||||
|[4.3,3.0,1.1,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0|
|
||||
|[5.8,4.0,1.2,0.2]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0|
|
||||
|[5.7,4.4,1.5,0.4]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0|
|
||||
|[5.4,3.9,1.3,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0|
|
||||
|[5.1,3.5,1.4,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0|
|
||||
|[5.7,3.8,1.7,0.3]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0|
|
||||
|[5.1,3.8,1.5,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0|
|
||||
+-----------------+----------+--------------------+--------------------+----------+
|
||||
|
||||
Single instance prediction
|
||||
--------------------------
|
||||
|
||||
XGBoostClassificationModel and XGBoostRegressionModel support making predictions on a single instance as well.
|
||||
It accepts a single Vector as the feature, and outputs the prediction label.
|
||||
|
||||
However, the overhead of single-instance prediction is high due to the internal overhead of XGBoost, use it carefully!
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val features = xgbInput.head().getAs[Vector]("features")
|
||||
val result = xgbClassificationModel.predict(features)
|
||||
|
||||
Model Persistence
|
||||
=================
|
||||
|
||||
Model and pipeline persistence
|
||||
------------------------------
|
||||
|
||||
A data scientist produces an ML model and hands it over to an engineering team for deployment in a production environment. Conversely, a trained model may be used by data scientists, for example as a baseline, across the process of data exploration. So it's important to support model persistence to make the models available across usage scenarios and programming languages.
|
||||
|
||||
XGBoost4j-Spark supports saving and loading XGBoostClassifier/XGBoostClassificationModel and XGBoostRegressor/XGBoostRegressionModel. It also supports saving and loading a ML pipeline which includes these estimators and models.
|
||||
|
||||
We can save the XGBoostClassificationModel to file system:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val xgbClassificationModelPath = "/tmp/xgbClassificationModel"
|
||||
xgbClassificationModel.write.overwrite().save(xgbClassificationModelPath)
|
||||
|
||||
and then loading the model in another session:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel
|
||||
|
||||
val xgbClassificationModel2 = XGBoostClassificationModel.load(xgbClassificationModelPath)
|
||||
xgbClassificationModel2.transform(xgbInput)
|
||||
|
||||
With regards to ML pipeline save and load, please refer the next section.
|
||||
|
||||
Interact with Other Bindings of XGBoost
|
||||
------------------------------------
|
||||
After we train a model with XGBoost4j-Spark on massive dataset, sometimes we want to do model serving in single machine or integrate it with other single node libraries for further processing. XGBoost4j-Spark supports export model to local by:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val nativeModelPath = "/tmp/nativeModel"
|
||||
xgbClassificationModel.nativeBooster.saveModel(nativeModelPath)
|
||||
|
||||
Then we can load this model with single node Python XGBoost:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import xgboost as xgb
|
||||
bst = xgb.Booster({'nthread': 4})
|
||||
bst.load_model(nativeModelPath)
|
||||
|
||||
.. note:: Using HDFS and S3 for exporting the models with nativeBooster.saveModel()
|
||||
|
||||
When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one. You can use HDFS and S3 by prefixing the path with ``hdfs://`` and ``s3://`` respectively. However, for this capability, you must do **one** of the following:
|
||||
|
||||
1. Build XGBoost4J-Spark with the steps described in `here <https://xgboost.readthedocs.io/en/latest/jvm/index.html#installation-from-source>`_, but turning `USE_HDFS <https://github.com/dmlc/xgboost/blob/e939192978a0c152ad7b49b744630e99d54cffa8/jvm-packages/create_jni.py#L18>`_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path.
|
||||
|
||||
- However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, choose the other option.
|
||||
|
||||
2. Use bindings of HDFS, S3, etc. to pass model files around. Here are the steps (taking HDFS as an example):
|
||||
|
||||
- Create a new file with
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val outputStream = fs.create("hdfs_path")
|
||||
|
||||
where "fs" is an instance of `org.apache.hadoop.fs.FileSystem <https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/fs/FileSystem.html>`_ class in Hadoop.
|
||||
|
||||
- Pass the returned OutputStream in the first step to nativeBooster.saveModel():
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
xgbClassificationModel.nativeBooster.saveModel(outputStream)
|
||||
|
||||
- Download file in other languages from HDFS and load with the pre-built (without the requirement of libhdfs.so) version of XGBoost. (The function "download_from_hdfs" is a helper function to be implemented by the user)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import xgboost as xgb
|
||||
bst = xgb.Booster({'nthread': 4})
|
||||
local_path = download_from_hdfs("hdfs_path")
|
||||
bst.load_model(local_path)
|
||||
|
||||
.. note:: Consistency issue between XGBoost4J-Spark and other bindings
|
||||
|
||||
There is a consistency issue between XGBoost4J-Spark and other language bindings of XGBoost.
|
||||
|
||||
When users use Spark to load training/test data in LIBSVM format with the following code snippet:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
spark.read.format("libsvm").load("trainingset_libsvm")
|
||||
|
||||
Spark assumes that the dataset is using 1-based indexing (feature indices starting with 1). However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is using 0-based indexing (feature indices starting with 0) by default. It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost. The solution is to transform the dataset to 0-based indexing before you predict with, for example, Python API, or you append ``?indexing_mode=1`` to your file path when loading with DMatrix. For example in Python:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
xgb.DMatrix('test.libsvm?indexing_mode=1')
|
||||
|
||||
*******************************************
|
||||
Building a ML Pipeline with XGBoost4J-Spark
|
||||
*******************************************
|
||||
|
||||
Basic ML Pipeline
|
||||
=================
|
||||
|
||||
Spark ML pipeline can combine multiple algorithms or functions into a single pipeline.
|
||||
It covers from feature extraction, transformation, selection to model training and prediction.
|
||||
XGBoost4j-Spark makes it feasible to embed XGBoost into such a pipeline seamlessly.
|
||||
The following example shows how to build such a pipeline consisting of Spark MLlib feature transformer
|
||||
and XGBoostClassifier estimator.
|
||||
|
||||
We still use `Iris <https://archive.ics.uci.edu/ml/datasets/iris>`_ dataset and the ``rawInput`` DataFrame.
|
||||
First we need to split the dataset into training and test dataset.
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val Array(training, test) = rawInput.randomSplit(Array(0.8, 0.2), 123)
|
||||
|
||||
Then we build the ML pipeline which includes 4 stages:
|
||||
|
||||
* Assemble all features into a single vector column.
|
||||
* From string label to indexed double label.
|
||||
* Use XGBoostClassifier to train classification model.
|
||||
* Convert indexed double label back to original string label.
|
||||
|
||||
We have shown the first three steps in the earlier sections, and the last step is finished with a new transformer `IndexToString <https://spark.apache.org/docs/2.3.1/api/scala/index.html#org.apache.spark.ml.feature.IndexToString>`_:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val labelConverter = new IndexToString()
|
||||
.setInputCol("prediction")
|
||||
.setOutputCol("realLabel")
|
||||
.setLabels(stringIndexer.labels)
|
||||
|
||||
We need to organize these steps as a Pipeline in Spark ML framework and evaluate the whole pipeline to get a PipelineModel:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
import org.apache.spark.ml.feature._
|
||||
import org.apache.spark.ml.Pipeline
|
||||
|
||||
val pipeline = new Pipeline()
|
||||
.setStages(Array(assembler, stringIndexer, booster, labelConverter))
|
||||
val model = pipeline.fit(training)
|
||||
|
||||
After we get the PipelineModel, we can make prediction on the test dataset and evaluate the model accuracy.
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
|
||||
|
||||
val prediction = model.transform(test)
|
||||
val evaluator = new MulticlassClassificationEvaluator()
|
||||
val accuracy = evaluator.evaluate(prediction)
|
||||
|
||||
Pipeline with Hyper-parameter Tuning
|
||||
=====================================
|
||||
The most critical operation to maximize the power of XGBoost is to select the optimal parameters for the model. Tuning parameters manually is a tedious and labor-consuming process. With the latest version of XGBoost4J-Spark, we can utilize the Spark model selecting tool to automate this process.
|
||||
|
||||
The following example shows the code snippet utilizing CrossValidation and MulticlassClassificationEvaluator
|
||||
to search the optimal combination of two XGBoost parameters, ``max_depth`` and ``eta``. (See :doc:`/parameter`.)
|
||||
The model producing the maximum accuracy defined by MulticlassClassificationEvaluator is selected and used to generate the prediction for the test set.
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
import org.apache.spark.ml.tuning._
|
||||
import org.apache.spark.ml.PipelineModel
|
||||
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel
|
||||
|
||||
val paramGrid = new ParamGridBuilder()
|
||||
.addGrid(booster.maxDepth, Array(3, 8))
|
||||
.addGrid(booster.eta, Array(0.2, 0.6))
|
||||
.build()
|
||||
val cv = new CrossValidator()
|
||||
.setEstimator(pipeline)
|
||||
.setEvaluator(evaluator)
|
||||
.setEstimatorParamMaps(paramGrid)
|
||||
.setNumFolds(3)
|
||||
|
||||
val cvModel = cv.fit(training)
|
||||
|
||||
val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2)
|
||||
.asInstanceOf[XGBoostClassificationModel]
|
||||
bestModel.extractParamMap()
|
||||
|
||||
*********************************
|
||||
Run XGBoost4J-Spark in Production
|
||||
*********************************
|
||||
|
||||
XGBoost4J-Spark is one of the most important steps to bring XGBoost to a production environment more easily. In this section, we introduce three key features to run XGBoost4J-Spark in production.
|
||||
|
||||
Parallel/Distributed Training
|
||||
=============================
|
||||
The massive size of training dataset is one of the most significant characteristics in production environment. To ensure that training in XGBoost scales with the data size, XGBoost4J-Spark bridges the distributed/parallel processing framework of Spark and the parallel/distributed training mechanism of XGBoost.
|
||||
|
||||
In XGBoost4J-Spark, each XGBoost worker is wrapped by a Spark task and the training dataset in Spark's memory space is fed to XGBoost workers in a transparent approach to the user.
|
||||
|
||||
In the code snippet where we build XGBoostClassifier, we set parameter ``num_workers`` (or ``numWorkers``).
|
||||
This parameter controls how many parallel workers we want to have when training a XGBoostClassificationModel.
|
||||
|
||||
.. note:: Regarding OpenMP optimization
|
||||
|
||||
By default, we allocate a core per each XGBoost worker. Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved
|
||||
by running multiple workers (i.e. Spark tasks) at the same time.
|
||||
|
||||
If you do want OpenMP optimization, you have to
|
||||
|
||||
1. set ``nthread`` to a value larger than 1 when creating XGBoostClassifier/XGBoostRegressor
|
||||
2. set ``spark.task.cpus`` in Spark to the same value as ``nthread``
|
||||
|
||||
Gang Scheduling
|
||||
===============
|
||||
XGBoost uses the `AllReduce <http://mpitutorial.com/tutorials/mpi-reduce-and-allreduce/>`_
|
||||
algorithm to synchronize the stats, e.g. histogram values, of each worker during training. Therefore XGBoost4J-Spark requires that all of ``nthread * numWorkers`` cores should be available before the training runs.
|
||||
|
||||
In the production environment where many users share the same cluster, it's hard to guarantee that your XGBoost4J-Spark application can get all requested resources for every run. By default, the communication layer in XGBoost will block the whole application when it requires more resources to be available. This process usually brings unnecessary resource waste as it keeps the ready resources and try to claim more. Additionally, this usually happens silently and does not bring the attention of users.
|
||||
|
||||
XGBoost4J-Spark allows the user to setup a timeout threshold for claiming resources from the cluster. If the application cannot get enough resources within this time period, the application would fail instead of wasting resources for hanging long. To enable this feature, you can set with XGBoostClassifier/XGBoostRegressor:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
xgbClassifier.setTimeoutRequestWorkers(60000L)
|
||||
|
||||
or pass in ``timeout_request_workers`` in ``xgbParamMap`` when building XGBoostClassifier:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val xgbParam = Map("eta" -> 0.1f,
|
||||
"max_depth" -> 2,
|
||||
"objective" -> "multi:softprob",
|
||||
"num_class" -> 3,
|
||||
"num_round" -> 100,
|
||||
"num_workers" -> 2,
|
||||
"timeout_request_workers" -> 60000L)
|
||||
val xgbClassifier = new XGBoostClassifier(xgbParam).
|
||||
setFeaturesCol("features").
|
||||
setLabelCol("classIndex")
|
||||
|
||||
If XGBoost4J-Spark cannot get enough resources for running two XGBoost workers, the application would fail. Users can have external mechanism to monitor the status of application and get notified for such case.
|
||||
|
||||
Checkpoint During Training
|
||||
==========================
|
||||
|
||||
Transient failures are also commonly seen in production environment. To simplify the design of XGBoost,
|
||||
we stop training if any of the distributed workers fail. However, if the training fails after having been through a long time, it would be a great waste of resources.
|
||||
|
||||
We support creating checkpoint during training to facilitate more efficient recovery from failure. To enable this feature, you can set how many iterations we build each checkpoint with ``setCheckpointInterval`` and the location of checkpoints with ``setCheckpointPath``:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
xgbClassifier.setCheckpointInterval(2)
|
||||
xgbClassifier.setCheckpointPath("/checkpoint_path")
|
||||
|
||||
An equivalent way is to pass in parameters in XGBoostClassifier's constructor:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val xgbParam = Map("eta" -> 0.1f,
|
||||
"max_depth" -> 2,
|
||||
"objective" -> "multi:softprob",
|
||||
"num_class" -> 3,
|
||||
"num_round" -> 100,
|
||||
"num_workers" -> 2,
|
||||
"checkpoint_path" -> "/checkpoints_path",
|
||||
"checkpoint_interval" -> 2)
|
||||
val xgbClassifier = new XGBoostClassifier(xgbParam).
|
||||
setFeaturesCol("features").
|
||||
setLabelCol("classIndex")
|
||||
|
||||
If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file in ``/checkpoints_path`` and start from the iteration when the checkpoint was built until the next failure or the specified 100 rounds.
|
||||
@@ -12,10 +12,6 @@ Before running XGBoost, we must set three types of parameters: general parameter
|
||||
|
||||
In R-package, you can use ``.`` (dot) to replace underscore in the parameters, for example, you can use ``max.depth`` to indicate ``max_depth``. The underscore parameters are also valid in R.
|
||||
|
||||
.. contents::
|
||||
:backlinks: none
|
||||
:local:
|
||||
|
||||
******************
|
||||
General Parameters
|
||||
******************
|
||||
@@ -119,7 +115,7 @@ Parameters for Tree Booster
|
||||
|
||||
* ``scale_pos_weight`` [default=1]
|
||||
|
||||
- Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: ``sum(negative instances) / sum(positive instances)``. See `Parameters Tuning </tutorials/param_tuning>`_ for more discussion. Also, see Higgs Kaggle competition demo for examples: `R <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R>`_, `py1 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py>`_, `py2 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py>`_, `py3 <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py>`_.
|
||||
- Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: ``sum(negative instances) / sum(positive instances)``. See :doc:`Parameters Tuning </tutorials/param_tuning>` for more discussion. Also, see Higgs Kaggle competition demo for examples: `R <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R>`_, `py1 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py>`_, `py2 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py>`_, `py3 <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py>`_.
|
||||
|
||||
* ``updater`` [default= ``grow_colmaker,prune``]
|
||||
|
||||
@@ -176,18 +172,6 @@ Parameters for Tree Booster
|
||||
|
||||
Additional parameters for Dart Booster (``booster=dart``)
|
||||
=========================================================
|
||||
|
||||
.. note:: Using ``predict()`` with DART booster
|
||||
|
||||
If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
|
||||
some of the trees will be evaluated. This will produce incorrect results if ``data`` is
|
||||
not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
|
||||
a nonzero value, e.g.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
preds = bst.predict(dtest, ntree_limit=num_round)
|
||||
|
||||
* ``sample_type`` [default= ``uniform``]
|
||||
|
||||
- Type of sampling algorithm.
|
||||
@@ -228,7 +212,7 @@ Additional parameters for Dart Booster (``booster=dart``)
|
||||
- range: [0.0, 1.0]
|
||||
|
||||
Parameters for Linear Booster (``booster=gblinear``)
|
||||
====================================================
|
||||
==================================================
|
||||
* ``lambda`` [default=0, alias: ``reg_lambda``]
|
||||
|
||||
- L2 regularization term on weights. Increasing this value will make model more conservative. Normalised to number of training examples.
|
||||
@@ -264,7 +248,6 @@ Specify the learning task and the corresponding learning objective. The objectiv
|
||||
- ``reg:logistic``: logistic regression
|
||||
- ``binary:logistic``: logistic regression for binary classification, output probability
|
||||
- ``binary:logitraw``: logistic regression for binary classification, output score before logistic transformation
|
||||
- ``binary:hinge``: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
|
||||
- ``gpu:reg:linear``, ``gpu:reg:logistic``, ``gpu:binary:logistic``, ``gpu:binary:logitraw``: versions
|
||||
of the corresponding objective functions evaluated on the GPU; note that like the GPU histogram algorithm,
|
||||
they can only be used when the entire training session uses the same dataset
|
||||
@@ -318,10 +301,6 @@ Command Line Parameters
|
||||
***********************
|
||||
The following parameters are only used in the console version of XGBoost
|
||||
|
||||
* ``use_buffer`` [default=1]
|
||||
|
||||
- Whether to create a binary buffer from text input. Doing so normally will speed up loading times
|
||||
|
||||
* ``num_round``
|
||||
|
||||
- The number of rounds for boosting
|
||||
@@ -361,6 +340,10 @@ The following parameters are only used in the console version of XGBoost
|
||||
|
||||
- Feature map, used for dumping model
|
||||
|
||||
* ``dump_format`` [default= ``text``] options: ``text``, ``json``
|
||||
|
||||
- Format of model dump file
|
||||
|
||||
* ``name_dump`` [default= ``dump.txt``]
|
||||
|
||||
- Name of model dump file
|
||||
|
||||
@@ -2,6 +2,10 @@ Python API Reference
|
||||
====================
|
||||
This page gives the Python API reference of xgboost, please also refer to Python Package Introduction for more information about python package.
|
||||
|
||||
.. contents::
|
||||
:backlinks: none
|
||||
:local:
|
||||
|
||||
Core Data Structure
|
||||
-------------------
|
||||
.. automodule:: xgboost.core
|
||||
@@ -29,9 +33,11 @@ Scikit-Learn API
|
||||
.. automodule:: xgboost.sklearn
|
||||
.. autoclass:: xgboost.XGBRegressor
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
.. autoclass:: xgboost.XGBClassifier
|
||||
:members:
|
||||
:inherited-members:
|
||||
:show-inheritance:
|
||||
|
||||
Plotting API
|
||||
|
||||
@@ -25,8 +25,7 @@ The XGBoost python module is able to load data from:
|
||||
- LibSVM text format file
|
||||
- Comma-separated values (CSV) file
|
||||
- NumPy 2D array
|
||||
- SciPy 2D sparse array
|
||||
- Pandas data frame, and
|
||||
- SciPy 2D sparse array, and
|
||||
- XGBoost binary buffer file.
|
||||
|
||||
(See :doc:`/tutorials/input_format` for detailed description of text input format.)
|
||||
@@ -67,14 +66,6 @@ The data is stored in a :py:class:`DMatrix <xgboost.DMatrix>` object.
|
||||
csr = scipy.sparse.csr_matrix((dat, (row, col)))
|
||||
dtrain = xgb.DMatrix(csr)
|
||||
|
||||
* To load a Pandas data frame into :py:class:`DMatrix <xgboost.DMatrix>`:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
data = pandas.DataFrame(np.arange(12).reshape((4,3)), columns=['a', 'b', 'c'])
|
||||
label = pandas.DataFrame(np.random.randint(2, size=4))
|
||||
dtrain = xgb.DMatrix(data, label=label)
|
||||
|
||||
* Saving :py:class:`DMatrix <xgboost.DMatrix>` into a XGBoost binary file will make loading faster:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@@ -5,9 +5,9 @@ This is a step-by-step tutorial on how to setup and run distributed `XGBoost <ht
|
||||
on an AWS EC2 cluster. Distributed XGBoost runs on various platforms such as MPI, SGE and Hadoop YARN.
|
||||
In this tutorial, we use YARN as an example since this is a widely used solution for distributed computing.
|
||||
|
||||
.. note:: XGBoost with Spark
|
||||
.. note:: XGBoost on Spark
|
||||
|
||||
If you are preprocessing training data with Spark, consider using :doc:`XGBoost4J-Spark </jvm/xgboost4j_spark_tutorial>`.
|
||||
If you are preprocessing training data with Spark, you may want to look at `XGBoost4J-Spark <https://xgboost.ai/2016/10/26/a-full-integration-of-xgboost-and-spark.html>`_, which supports distributed training on Resilient Distributed Dataset (RDD).
|
||||
|
||||
************
|
||||
Prerequisite
|
||||
|
||||
@@ -111,9 +111,3 @@ Sample Script
|
||||
# make prediction
|
||||
# ntree_limit must not be 0
|
||||
preds = bst.predict(dtest, ntree_limit=num_round)
|
||||
|
||||
.. note:: Specify ``ntree_limit`` when predicting with test sets
|
||||
|
||||
By default, ``bst.predict()`` will perform dropouts on trees. To obtain
|
||||
correct results on test sets, disable dropouts by specifying
|
||||
a nonzero value for ``ntree_limit``.
|
||||
|
||||
@@ -13,6 +13,10 @@ The external memory version takes in the following filename format:
|
||||
The ``filename`` is the normal path to libsvm file you want to load in, and ``cacheprefix`` is a
|
||||
path to a cache file that XGBoost will use for external memory cache.
|
||||
|
||||
.. note:: External memory is not available with GPU algorithms
|
||||
|
||||
External memory is not available when ``tree_method`` is set to ``gpu_exact`` or ``gpu_hist``.
|
||||
|
||||
The following code was extracted from `demo/guide-python/external_memory.py <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/external_memory.py>`_:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@@ -10,8 +10,7 @@ See `Awesome XGBoost <https://github.com/dmlc/xgboost/tree/master/demo>`_ for mo
|
||||
:caption: Contents:
|
||||
|
||||
model
|
||||
Distributed XGBoost with AWS YARN <aws_yarn>
|
||||
Distributed XGBoost with XGBoost4J-Spark <https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_tutorial.html>
|
||||
aws_yarn
|
||||
dart
|
||||
monotonic
|
||||
input_format
|
||||
|
||||
@@ -223,7 +223,7 @@ In this equation, :math:`w_j` are independent with respect to each other, the fo
|
||||
w_j^\ast &= -\frac{G_j}{H_j+\lambda}\\
|
||||
\text{obj}^\ast &= -\frac{1}{2} \sum_{j=1}^T \frac{G_j^2}{H_j+\lambda} + \gamma T
|
||||
|
||||
The last equation measures *how good* a tree structure :math:`$q(x)` is.
|
||||
The last equation measures *how good* a tree structure :math:`q(x)` is.
|
||||
|
||||
.. image:: https://raw.githubusercontent.com/dmlc/web-data/master/xgboost/model/struct_score.png
|
||||
:width: 100%
|
||||
|
||||
@@ -69,7 +69,7 @@
|
||||
/*!
|
||||
* \brief Tag function as usable by device
|
||||
*/
|
||||
#if defined (__CUDA__) || defined(__NVCC__)
|
||||
#ifdef __NVCC__
|
||||
#define XGBOOST_DEVICE __host__ __device__
|
||||
#else
|
||||
#define XGBOOST_DEVICE
|
||||
|
||||
@@ -96,15 +96,6 @@ XGB_EXTERN_C typedef int XGBCallbackDataIterNext( // NOLINT(*)
|
||||
*/
|
||||
XGB_DLL const char *XGBGetLastError(void);
|
||||
|
||||
/*!
|
||||
* \brief register callback function for LOG(INFO) messages -- helpful messages
|
||||
* that are not errors.
|
||||
* Note: this function can be called by multiple threads. The callback function
|
||||
* will run on the thread that registered it
|
||||
* \return 0 for success, -1 for failure
|
||||
*/
|
||||
XGB_DLL int XGBRegisterLogCallback(void (*callback)(const char*));
|
||||
|
||||
/*!
|
||||
* \brief load a data matrix
|
||||
* \param fname the name of the file
|
||||
@@ -228,22 +219,6 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const float *data, // NOLINT
|
||||
bst_ulong nrow, bst_ulong ncol,
|
||||
float missing, DMatrixHandle *out,
|
||||
int nthread);
|
||||
/*!
|
||||
* \brief create matrix content from python data table
|
||||
* \param data pointer to pointer to column data
|
||||
* \param feature_stypes pointer to strings
|
||||
* \param nrow number of rows
|
||||
* \param ncol number columns
|
||||
* \param out created dmatrix
|
||||
* \param nthread number of threads (up to maximum cores available, if <=0 use all cores)
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGDMatrixCreateFromDT(void** data,
|
||||
const char ** feature_stypes,
|
||||
bst_ulong nrow,
|
||||
bst_ulong ncol,
|
||||
DMatrixHandle* out,
|
||||
int nthread);
|
||||
/*!
|
||||
* \brief create a new dmatrix from sliced content of existing matrix
|
||||
* \param handle instance of data matrix to be sliced
|
||||
@@ -286,7 +261,7 @@ XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle,
|
||||
* \brief set uint32 vector to a content in info
|
||||
* \param handle a instance of data matrix
|
||||
* \param field field name
|
||||
* \param array pointer to unsigned int vector
|
||||
* \param array pointer to float vector
|
||||
* \param len length of array
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
|
||||
@@ -9,11 +9,10 @@
|
||||
|
||||
#include <dmlc/base.h>
|
||||
#include <dmlc/data.h>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
#include "./base.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -53,8 +52,6 @@ class MetaInfo {
|
||||
std::vector<bst_uint> group_ptr_;
|
||||
/*! \brief weights of each instance, optional */
|
||||
std::vector<bst_float> weights_;
|
||||
/*! \brief session-id of each instance, optional */
|
||||
std::vector<uint64_t> qids_;
|
||||
/*!
|
||||
* \brief initialized margins,
|
||||
* if specified, xgboost will start from this init margin
|
||||
@@ -62,9 +59,7 @@ class MetaInfo {
|
||||
*/
|
||||
std::vector<bst_float> base_margin_;
|
||||
/*! \brief version flag, used to check version of this info */
|
||||
static const int kVersion = 2;
|
||||
/*! \brief version that introduced qid field */
|
||||
static const int kVersionQidAdded = 2;
|
||||
static const int kVersion = 1;
|
||||
/*! \brief default constructor */
|
||||
MetaInfo() = default;
|
||||
/*!
|
||||
@@ -122,39 +117,28 @@ class MetaInfo {
|
||||
mutable std::vector<size_t> label_order_cache_;
|
||||
};
|
||||
|
||||
/*! \brief Element from a sparse vector */
|
||||
struct Entry {
|
||||
/*! \brief feature index */
|
||||
bst_uint index;
|
||||
/*! \brief feature value */
|
||||
bst_float fvalue;
|
||||
/*! \brief default constructor */
|
||||
Entry() = default;
|
||||
/*!
|
||||
* \brief constructor with index and value
|
||||
* \param index The feature or row index.
|
||||
* \param fvalue THe feature value.
|
||||
*/
|
||||
Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {}
|
||||
/*! \brief reversely compare feature values */
|
||||
inline static bool CmpValue(const Entry& a, const Entry& b) {
|
||||
return a.fvalue < b.fvalue;
|
||||
}
|
||||
inline bool operator==(const Entry& other) const {
|
||||
return (this->index == other.index && this->fvalue == other.fvalue);
|
||||
}
|
||||
};
|
||||
/*! \brief read-only sparse instance batch in CSR format */
|
||||
struct SparseBatch {
|
||||
/*! \brief an entry of sparse vector */
|
||||
struct Entry {
|
||||
/*! \brief feature index */
|
||||
bst_uint index;
|
||||
/*! \brief feature value */
|
||||
bst_float fvalue;
|
||||
/*! \brief default constructor */
|
||||
Entry() = default;
|
||||
/*!
|
||||
* \brief constructor with index and value
|
||||
* \param index The feature or row index.
|
||||
* \param fvalue THe feature value.
|
||||
*/
|
||||
Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {}
|
||||
/*! \brief reversely compare feature values */
|
||||
inline static bool CmpValue(const Entry& a, const Entry& b) {
|
||||
return a.fvalue < b.fvalue;
|
||||
}
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief in-memory storage unit of sparse batch
|
||||
*/
|
||||
class SparsePage {
|
||||
public:
|
||||
std::vector<size_t> offset;
|
||||
/*! \brief the data of the segments */
|
||||
std::vector<Entry> data;
|
||||
|
||||
size_t base_rowid;
|
||||
/*! \brief an instance of sparse vector in the batch */
|
||||
struct Inst {
|
||||
/*! \brief pointer to the elements*/
|
||||
@@ -170,83 +154,38 @@ class SparsePage {
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief get i-th row from the batch */
|
||||
inline Inst operator[](size_t i) const {
|
||||
return {data.data() + offset[i], static_cast<bst_uint>(offset[i + 1] - offset[i])};
|
||||
}
|
||||
|
||||
/*! \brief constructor */
|
||||
SparsePage() {
|
||||
this->Clear();
|
||||
}
|
||||
/*! \return number of instance in the page */
|
||||
inline size_t Size() const {
|
||||
return offset.size() - 1;
|
||||
}
|
||||
/*! \return estimation of memory cost of this page */
|
||||
inline size_t MemCostBytes() const {
|
||||
return offset.size() * sizeof(size_t) + data.size() * sizeof(Entry);
|
||||
}
|
||||
/*! \brief clear the page */
|
||||
inline void Clear() {
|
||||
base_rowid = 0;
|
||||
offset.clear();
|
||||
offset.push_back(0);
|
||||
data.clear();
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Push row block into the page.
|
||||
* \param batch the row batch.
|
||||
*/
|
||||
inline void Push(const dmlc::RowBlock<uint32_t>& batch) {
|
||||
data.reserve(data.size() + batch.offset[batch.size] - batch.offset[0]);
|
||||
offset.reserve(offset.size() + batch.size);
|
||||
CHECK(batch.index != nullptr);
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
offset.push_back(offset.back() + batch.offset[i + 1] - batch.offset[i]);
|
||||
}
|
||||
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
|
||||
uint32_t index = batch.index[i];
|
||||
bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i];
|
||||
data.emplace_back(index, fvalue);
|
||||
}
|
||||
CHECK_EQ(offset.back(), data.size());
|
||||
}
|
||||
/*!
|
||||
* \brief Push a sparse page
|
||||
* \param batch the row page
|
||||
*/
|
||||
inline void Push(const SparsePage &batch) {
|
||||
size_t top = offset.back();
|
||||
data.resize(top + batch.data.size());
|
||||
std::memcpy(dmlc::BeginPtr(data) + top,
|
||||
dmlc::BeginPtr(batch.data),
|
||||
sizeof(Entry) * batch.data.size());
|
||||
size_t begin = offset.size();
|
||||
offset.resize(begin + batch.Size());
|
||||
for (size_t i = 0; i < batch.Size(); ++i) {
|
||||
offset[i + begin] = top + batch.offset[i + 1];
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief Push one instance into page
|
||||
* \param inst an instance row
|
||||
*/
|
||||
inline void Push(const Inst &inst) {
|
||||
offset.push_back(offset.back() + inst.length);
|
||||
size_t begin = data.size();
|
||||
data.resize(begin + inst.length);
|
||||
if (inst.length != 0) {
|
||||
std::memcpy(dmlc::BeginPtr(data) + begin, inst.data,
|
||||
sizeof(Entry) * inst.length);
|
||||
}
|
||||
}
|
||||
|
||||
size_t Size() { return offset.size() - 1; }
|
||||
/*! \brief batch size */
|
||||
size_t size;
|
||||
};
|
||||
|
||||
/*! \brief read-only row batch, used to access row continuously */
|
||||
struct RowBatch : public SparseBatch {
|
||||
/*! \brief the offset of rowid of this batch */
|
||||
size_t base_rowid;
|
||||
/*! \brief array[size+1], row pointer of each of the elements */
|
||||
const size_t *ind_ptr;
|
||||
/*! \brief array[ind_ptr.back()], content of the sparse element */
|
||||
const Entry *data_ptr;
|
||||
/*! \brief get i-th row from the batch */
|
||||
inline Inst operator[](size_t i) const {
|
||||
return {data_ptr + ind_ptr[i], static_cast<bst_uint>(ind_ptr[i + 1] - ind_ptr[i])};
|
||||
}
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief read-only column batch, used to access columns,
|
||||
* the columns are not required to be continuous
|
||||
*/
|
||||
struct ColBatch : public SparseBatch {
|
||||
/*! \brief column index of each columns in the data */
|
||||
const bst_uint *col_index;
|
||||
/*! \brief pointer to the column data */
|
||||
const Inst *col_data;
|
||||
/*! \brief get i-th column from the batch */
|
||||
inline Inst operator[](size_t i) const {
|
||||
return col_data[i];
|
||||
}
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief This is data structure that user can pass to DMatrix::Create
|
||||
@@ -255,7 +194,7 @@ class SparsePage {
|
||||
*
|
||||
* On distributed setting, usually an customized dmlc::Parser is needed instead.
|
||||
*/
|
||||
class DataSource : public dmlc::DataIter<SparsePage> {
|
||||
class DataSource : public dmlc::DataIter<RowBatch> {
|
||||
public:
|
||||
/*!
|
||||
* \brief Meta information about the dataset
|
||||
@@ -321,17 +260,28 @@ class DMatrix {
|
||||
* \brief get the row iterator, reset to beginning position
|
||||
* \note Only either RowIterator or column Iterator can be active.
|
||||
*/
|
||||
virtual dmlc::DataIter<SparsePage>* RowIterator() = 0;
|
||||
virtual dmlc::DataIter<RowBatch>* RowIterator() = 0;
|
||||
/*!\brief get column iterator, reset to the beginning position */
|
||||
virtual dmlc::DataIter<SparsePage>* ColIterator() = 0;
|
||||
virtual dmlc::DataIter<ColBatch>* ColIterator() = 0;
|
||||
/*!
|
||||
* \brief get the column iterator associated with subset of column features.
|
||||
* \param fset is the list of column index set that must be contained in the returning Column iterator
|
||||
* \return the column iterator, initialized so that it reads the elements in fset
|
||||
*/
|
||||
virtual dmlc::DataIter<ColBatch>* ColIterator(const std::vector<bst_uint>& fset) = 0;
|
||||
/*!
|
||||
* \brief check if column access is supported, if not, initialize column access.
|
||||
* \param enabled whether certain feature should be included in column access.
|
||||
* \param subsample subsample ratio when generating column access.
|
||||
* \param max_row_perbatch auxiliary information, maximum row used in each column batch.
|
||||
* this is a hint information that can be ignored by the implementation.
|
||||
* \param sorted If column features should be in sorted order
|
||||
* \return Number of column blocks in the column access.
|
||||
*/
|
||||
virtual void InitColAccess(size_t max_row_perbatch, bool sorted) = 0;
|
||||
|
||||
virtual void InitColAccess(const std::vector<bool>& enabled,
|
||||
float subsample,
|
||||
size_t max_row_perbatch, bool sorted) = 0;
|
||||
// the following are column meta data, should be able to answer them fast.
|
||||
/*! \return whether column access is enabled */
|
||||
virtual bool HaveColAccess(bool sorted) const = 0;
|
||||
@@ -438,7 +388,7 @@ inline bool RowSet::Load(dmlc::Stream* fi) {
|
||||
} // namespace xgboost
|
||||
|
||||
namespace dmlc {
|
||||
DMLC_DECLARE_TRAITS(is_pod, xgboost::Entry, true);
|
||||
DMLC_DECLARE_TRAITS(is_pod, xgboost::SparseBatch::Entry, true);
|
||||
DMLC_DECLARE_TRAITS(has_saveload, xgboost::RowSet, true);
|
||||
}
|
||||
#endif // XGBOOST_DATA_H_
|
||||
|
||||
@@ -94,7 +94,7 @@ class GradientBooster {
|
||||
* \param root_index the root index
|
||||
* \sa Predict
|
||||
*/
|
||||
virtual void PredictInstance(const SparsePage::Inst& inst,
|
||||
virtual void PredictInstance(const SparseBatch::Inst& inst,
|
||||
std::vector<bst_float>* out_preds,
|
||||
unsigned ntree_limit = 0,
|
||||
unsigned root_index = 0) = 0;
|
||||
|
||||
@@ -167,7 +167,7 @@ class Learner : public rabit::Serializable {
|
||||
* \param out_preds output vector to hold the predictions
|
||||
* \param ntree_limit limit the number of trees used in prediction
|
||||
*/
|
||||
inline void Predict(const SparsePage::Inst &inst,
|
||||
inline void Predict(const SparseBatch::Inst &inst,
|
||||
bool output_margin,
|
||||
HostDeviceVector<bst_float> *out_preds,
|
||||
unsigned ntree_limit = 0) const;
|
||||
@@ -190,7 +190,7 @@ class Learner : public rabit::Serializable {
|
||||
};
|
||||
|
||||
// implementation of inline functions.
|
||||
inline void Learner::Predict(const SparsePage::Inst& inst,
|
||||
inline void Learner::Predict(const SparseBatch::Inst& inst,
|
||||
bool output_margin,
|
||||
HostDeviceVector<bst_float>* out_preds,
|
||||
unsigned ntree_limit) const {
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#define XGBOOST_LOGGING_H_
|
||||
|
||||
#include <dmlc/logging.h>
|
||||
#include <dmlc/thread_local.h>
|
||||
#include <sstream>
|
||||
#include "./base.h"
|
||||
|
||||
@@ -38,23 +37,6 @@ class TrackerLogger : public BaseLogger {
|
||||
~TrackerLogger();
|
||||
};
|
||||
|
||||
class LogCallbackRegistry {
|
||||
public:
|
||||
using Callback = void (*)(const char*);
|
||||
LogCallbackRegistry()
|
||||
: log_callback_([] (const char* msg) { std::cerr << msg << std::endl; }) {}
|
||||
inline void Register(Callback log_callback) {
|
||||
this->log_callback_ = log_callback;
|
||||
}
|
||||
inline Callback Get() const {
|
||||
return log_callback_;
|
||||
}
|
||||
private:
|
||||
Callback log_callback_;
|
||||
};
|
||||
|
||||
using LogCallbackRegistryStore = dmlc::ThreadLocalStore<LogCallbackRegistry>;
|
||||
|
||||
// redefines the logging macro if not existed
|
||||
#ifndef LOG
|
||||
#define LOG(severity) LOG_##severity.stream()
|
||||
|
||||
@@ -88,7 +88,7 @@ class Predictor {
|
||||
int num_new_trees) = 0;
|
||||
|
||||
/**
|
||||
* \fn virtual void Predictor::PredictInstance( const SparsePage::Inst&
|
||||
* \fn virtual void Predictor::PredictInstance( const SparseBatch::Inst&
|
||||
* inst, std::vector<bst_float>* out_preds, const gbm::GBTreeModel& model,
|
||||
* unsigned ntree_limit = 0, unsigned root_index = 0) = 0;
|
||||
*
|
||||
@@ -104,7 +104,7 @@ class Predictor {
|
||||
* \param root_index (Optional) Zero-based index of the root.
|
||||
*/
|
||||
|
||||
virtual void PredictInstance(const SparsePage::Inst& inst,
|
||||
virtual void PredictInstance(const SparseBatch::Inst& inst,
|
||||
std::vector<bst_float>* out_preds,
|
||||
const gbm::GBTreeModel& model,
|
||||
unsigned ntree_limit = 0,
|
||||
|
||||
@@ -447,12 +447,12 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat> {
|
||||
* \brief fill the vector with sparse vector
|
||||
* \param inst The sparse instance to fill.
|
||||
*/
|
||||
inline void Fill(const SparsePage::Inst& inst);
|
||||
inline void Fill(const RowBatch::Inst& inst);
|
||||
/*!
|
||||
* \brief drop the trace after fill, must be called after fill.
|
||||
* \param inst The sparse instance to drop.
|
||||
*/
|
||||
inline void Drop(const SparsePage::Inst& inst);
|
||||
inline void Drop(const RowBatch::Inst& inst);
|
||||
/*!
|
||||
* \brief returns the size of the feature vector
|
||||
* \return the size of the feature vector
|
||||
@@ -573,14 +573,14 @@ inline void RegTree::FVec::Init(size_t size) {
|
||||
std::fill(data_.begin(), data_.end(), e);
|
||||
}
|
||||
|
||||
inline void RegTree::FVec::Fill(const SparsePage::Inst& inst) {
|
||||
inline void RegTree::FVec::Fill(const RowBatch::Inst& inst) {
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
if (inst[i].index >= data_.size()) continue;
|
||||
data_[inst[i].index].fvalue = inst[i].fvalue;
|
||||
}
|
||||
}
|
||||
|
||||
inline void RegTree::FVec::Drop(const SparsePage::Inst& inst) {
|
||||
inline void RegTree::FVec::Drop(const RowBatch::Inst& inst) {
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
if (inst[i].index >= data_.size()) continue;
|
||||
data_[inst[i].index].flag = -1;
|
||||
|
||||
@@ -20,27 +20,6 @@ You can find more about XGBoost on [Documentation](https://xgboost.readthedocs.o
|
||||
|
||||
XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5
|
||||
|
||||
### Access release version
|
||||
|
||||
<b>maven</b>
|
||||
|
||||
```
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j</artifactId>
|
||||
<version>latest_version_num</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
<b>sbt</b>
|
||||
```sbt
|
||||
"ml.dmlc" % "xgboost4j" % "latest_version_num"
|
||||
```
|
||||
|
||||
For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases).
|
||||
|
||||
if you want to use `xgboost4j-spark`, you just need to replace xgboost4j with `xgboost4j-spark`
|
||||
|
||||
### Access SNAPSHOT version
|
||||
|
||||
You need to add github as repo:
|
||||
@@ -78,7 +57,7 @@ the add dependency as following:
|
||||
"ml.dmlc" % "xgboost4j" % "latest_version_num"
|
||||
```
|
||||
|
||||
For the latest release version number, please check [here](https://github.com/CodingCat/xgboost/tree/maven-repo/ml/dmlc/xgboost4j).
|
||||
For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases).
|
||||
|
||||
if you want to use `xgboost4j-spark`, you just need to replace xgboost4j with `xgboost4j-spark`
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
<packaging>pom</packaging>
|
||||
<name>XGBoost JVM Package</name>
|
||||
<description>JVM Package for XGBoost</description>
|
||||
@@ -23,6 +23,11 @@
|
||||
<email>codingcat@apache.org</email>
|
||||
</developer>
|
||||
</developers>
|
||||
<scm>
|
||||
<connection>scm:git:git:/github.com/dmlc/xgboost.git</connection>
|
||||
<developerConnection>scm:git:ssh://github.com/dmlc/xgboost.git</developerConnection>
|
||||
<url>https://github.com/dmlc/xgboost</url>
|
||||
</scm>
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||
@@ -118,6 +123,24 @@
|
||||
<autoReleaseAfterClose>false</autoReleaseAfterClose>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>2.9.1</version>
|
||||
<configuration>
|
||||
<excludePackageNames>
|
||||
ml.dmlc.xgboost4j.java.example
|
||||
</excludePackageNames>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-javadocs</id>
|
||||
<goals>
|
||||
<goal>jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
@@ -208,6 +231,19 @@
|
||||
</distributionManagement>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.scala-tools</groupId>
|
||||
<artifactId>maven-scala-plugin</artifactId>
|
||||
<version>2.15.2</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>compile</goal>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.scalastyle</groupId>
|
||||
<artifactId>scalastyle-maven-plugin</artifactId>
|
||||
|
||||
@@ -6,10 +6,10 @@
|
||||
<parent>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
</parent>
|
||||
<artifactId>xgboost4j-example</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
<packaging>jar</packaging>
|
||||
<build>
|
||||
<plugins>
|
||||
@@ -26,7 +26,7 @@
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j-spark</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
@@ -37,7 +37,7 @@
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j-flink</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
|
||||
@@ -1,131 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.example.spark
|
||||
|
||||
import org.apache.spark.ml.{Pipeline, PipelineModel}
|
||||
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
|
||||
import org.apache.spark.ml.feature._
|
||||
import org.apache.spark.ml.tuning._
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}
|
||||
|
||||
// this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)
|
||||
|
||||
object SparkMLlibPipeline {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
if (args.length != 1) {
|
||||
println("Usage: SparkMLlibPipeline input_path native_model_path pipeline_model_path")
|
||||
sys.exit(1)
|
||||
}
|
||||
|
||||
val inputPath = args(0)
|
||||
val nativeModelPath = args(1)
|
||||
val pipelineModelPath = args(2)
|
||||
|
||||
val spark = SparkSession
|
||||
.builder()
|
||||
.appName("XGBoost4J-Spark Pipeline Example")
|
||||
.getOrCreate()
|
||||
|
||||
// Load dataset
|
||||
val schema = new StructType(Array(
|
||||
StructField("sepal length", DoubleType, true),
|
||||
StructField("sepal width", DoubleType, true),
|
||||
StructField("petal length", DoubleType, true),
|
||||
StructField("petal width", DoubleType, true),
|
||||
StructField("class", StringType, true)))
|
||||
|
||||
val rawInput = spark.read.schema(schema).csv(inputPath)
|
||||
|
||||
// Split training and test dataset
|
||||
val Array(training, test) = rawInput.randomSplit(Array(0.8, 0.2), 123)
|
||||
|
||||
// Build ML pipeline, it includes 4 stages:
|
||||
// 1, Assemble all features into a single vector column.
|
||||
// 2, From string label to indexed double label.
|
||||
// 3, Use XGBoostClassifier to train classification model.
|
||||
// 4, Convert indexed double label back to original string label.
|
||||
val assembler = new VectorAssembler()
|
||||
.setInputCols(Array("sepal length", "sepal width", "petal length", "petal width"))
|
||||
.setOutputCol("features")
|
||||
val labelIndexer = new StringIndexer()
|
||||
.setInputCol("class")
|
||||
.setOutputCol("classIndex")
|
||||
.fit(training)
|
||||
val booster = new XGBoostClassifier(
|
||||
Map("eta" -> 0.1f,
|
||||
"max_depth" -> 2,
|
||||
"objective" -> "multi:softprob",
|
||||
"num_class" -> 3,
|
||||
"num_round" -> 100,
|
||||
"num_workers" -> 2
|
||||
)
|
||||
)
|
||||
val labelConverter = new IndexToString()
|
||||
.setInputCol("prediction")
|
||||
.setOutputCol("realLabel")
|
||||
.setLabels(labelIndexer.labels)
|
||||
|
||||
val pipeline = new Pipeline()
|
||||
.setStages(Array(assembler, labelIndexer, booster, labelConverter))
|
||||
val model = pipeline.fit(training)
|
||||
|
||||
// Batch prediction
|
||||
val prediction = model.transform(test)
|
||||
prediction.show(false)
|
||||
|
||||
// Model evaluation
|
||||
val evaluator = new MulticlassClassificationEvaluator()
|
||||
val accuracy = evaluator.evaluate(prediction)
|
||||
println("The model accuracy is : " + accuracy)
|
||||
|
||||
// Tune model using cross validation
|
||||
val paramGrid = new ParamGridBuilder()
|
||||
.addGrid(booster.maxDepth, Array(3, 8))
|
||||
.addGrid(booster.eta, Array(0.2, 0.6))
|
||||
.build()
|
||||
val cv = new CrossValidator()
|
||||
.setEstimator(pipeline)
|
||||
.setEvaluator(evaluator)
|
||||
.setEstimatorParamMaps(paramGrid)
|
||||
.setNumFolds(3)
|
||||
|
||||
val cvModel = cv.fit(training)
|
||||
|
||||
val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2)
|
||||
.asInstanceOf[XGBoostClassificationModel]
|
||||
println("The params of best XGBoostClassification model : " +
|
||||
bestModel.extractParamMap())
|
||||
println("The training summary of best XGBoostClassificationModel : " +
|
||||
bestModel.summary)
|
||||
|
||||
// Export the XGBoostClassificationModel as local XGBoost model,
|
||||
// then you can load it back in local Python environment.
|
||||
bestModel.nativeBooster.saveModel(nativeModelPath)
|
||||
|
||||
// ML pipeline persistence
|
||||
model.write.overwrite().save(pipelineModelPath)
|
||||
|
||||
// Load a saved model and serving
|
||||
val model2 = PipelineModel.load(pipelineModelPath)
|
||||
model2.transform(test).show(false)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,206 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.example.spark
|
||||
|
||||
|
||||
import scala.collection.mutable
|
||||
import scala.collection.mutable.ListBuffer
|
||||
import scala.io.Source
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.spark.{XGBoostEstimator, XGBoost}
|
||||
import org.apache.spark.ml.Pipeline
|
||||
import org.apache.spark.ml.evaluation.RegressionEvaluator
|
||||
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer}
|
||||
import org.apache.spark.ml.tuning._
|
||||
import org.apache.spark.sql.{Dataset, DataFrame, SparkSession}
|
||||
|
||||
case class SalesRecord(storeId: Int, daysOfWeek: Int, date: String, sales: Int, customers: Int,
|
||||
open: Int, promo: Int, stateHoliday: String, schoolHoliday: String)
|
||||
|
||||
case class Store(storeId: Int, storeType: String, assortment: String, competitionDistance: Int,
|
||||
competitionOpenSinceMonth: Int, competitionOpenSinceYear: Int, promo2: Int,
|
||||
promo2SinceWeek: Int, promo2SinceYear: Int, promoInterval: String)
|
||||
|
||||
object SparkModelTuningTool {
|
||||
|
||||
private def parseStoreFile(storeFilePath: String): List[Store] = {
|
||||
var isHeader = true
|
||||
val storeInstances = new ListBuffer[Store]
|
||||
for (line <- Source.fromFile(storeFilePath).getLines()) {
|
||||
if (isHeader) {
|
||||
isHeader = false
|
||||
} else {
|
||||
try {
|
||||
val strArray = line.split(",")
|
||||
if (strArray.length == 10) {
|
||||
val Array(storeIdStr, storeTypeStr, assortmentStr, competitionDistanceStr,
|
||||
competitionOpenSinceMonthStr, competitionOpenSinceYearStr, promo2Str,
|
||||
promo2SinceWeekStr, promo2SinceYearStr, promoIntervalStr) = line.split(",")
|
||||
storeInstances += Store(storeIdStr.toInt, storeTypeStr, assortmentStr,
|
||||
if (competitionDistanceStr == "") -1 else competitionDistanceStr.toInt,
|
||||
if (competitionOpenSinceMonthStr == "" ) -1 else competitionOpenSinceMonthStr.toInt,
|
||||
if (competitionOpenSinceYearStr == "" ) -1 else competitionOpenSinceYearStr.toInt,
|
||||
promo2Str.toInt,
|
||||
if (promo2Str == "0") -1 else promo2SinceWeekStr.toInt,
|
||||
if (promo2Str == "0") -1 else promo2SinceYearStr.toInt,
|
||||
promoIntervalStr.replace("\"", ""))
|
||||
} else {
|
||||
val Array(storeIdStr, storeTypeStr, assortmentStr, competitionDistanceStr,
|
||||
competitionOpenSinceMonthStr, competitionOpenSinceYearStr, promo2Str,
|
||||
promo2SinceWeekStr, promo2SinceYearStr, firstMonth, secondMonth, thirdMonth,
|
||||
forthMonth) = line.split(",")
|
||||
storeInstances += Store(storeIdStr.toInt, storeTypeStr, assortmentStr,
|
||||
if (competitionDistanceStr == "") -1 else competitionDistanceStr.toInt,
|
||||
if (competitionOpenSinceMonthStr == "" ) -1 else competitionOpenSinceMonthStr.toInt,
|
||||
if (competitionOpenSinceYearStr == "" ) -1 else competitionOpenSinceYearStr.toInt,
|
||||
promo2Str.toInt,
|
||||
if (promo2Str == "0") -1 else promo2SinceWeekStr.toInt,
|
||||
if (promo2Str == "0") -1 else promo2SinceYearStr.toInt,
|
||||
firstMonth.replace("\"", "") + "," + secondMonth + "," + thirdMonth + "," +
|
||||
forthMonth.replace("\"", ""))
|
||||
}
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
e.printStackTrace()
|
||||
sys.exit(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
storeInstances.toList
|
||||
}
|
||||
|
||||
private def parseTrainingFile(trainingPath: String): List[SalesRecord] = {
|
||||
var isHeader = true
|
||||
val records = new ListBuffer[SalesRecord]
|
||||
for (line <- Source.fromFile(trainingPath).getLines()) {
|
||||
if (isHeader) {
|
||||
isHeader = false
|
||||
} else {
|
||||
val Array(storeIdStr, daysOfWeekStr, dateStr, salesStr, customerStr, openStr, promoStr,
|
||||
stateHolidayStr, schoolHolidayStr) = line.split(",")
|
||||
val salesRecord = SalesRecord(storeIdStr.toInt, daysOfWeekStr.toInt, dateStr,
|
||||
salesStr.toInt, customerStr.toInt, openStr.toInt, promoStr.toInt, stateHolidayStr,
|
||||
schoolHolidayStr)
|
||||
records += salesRecord
|
||||
}
|
||||
}
|
||||
records.toList
|
||||
}
|
||||
|
||||
private def featureEngineering(ds: DataFrame): DataFrame = {
|
||||
import org.apache.spark.sql.functions._
|
||||
import ds.sparkSession.implicits._
|
||||
val stateHolidayIndexer = new StringIndexer()
|
||||
.setInputCol("stateHoliday")
|
||||
.setOutputCol("stateHolidayIndex")
|
||||
val schoolHolidayIndexer = new StringIndexer()
|
||||
.setInputCol("schoolHoliday")
|
||||
.setOutputCol("schoolHolidayIndex")
|
||||
val storeTypeIndexer = new StringIndexer()
|
||||
.setInputCol("storeType")
|
||||
.setOutputCol("storeTypeIndex")
|
||||
val assortmentIndexer = new StringIndexer()
|
||||
.setInputCol("assortment")
|
||||
.setOutputCol("assortmentIndex")
|
||||
val promoInterval = new StringIndexer()
|
||||
.setInputCol("promoInterval")
|
||||
.setOutputCol("promoIntervalIndex")
|
||||
val filteredDS = ds.filter($"sales" > 0).filter($"open" > 0)
|
||||
// parse date
|
||||
val dsWithDayCol =
|
||||
filteredDS.withColumn("day", udf((dateStr: String) =>
|
||||
dateStr.split("-")(2).toInt).apply(col("date")))
|
||||
val dsWithMonthCol =
|
||||
dsWithDayCol.withColumn("month", udf((dateStr: String) =>
|
||||
dateStr.split("-")(1).toInt).apply(col("date")))
|
||||
val dsWithYearCol =
|
||||
dsWithMonthCol.withColumn("year", udf((dateStr: String) =>
|
||||
dateStr.split("-")(0).toInt).apply(col("date")))
|
||||
val dsWithLogSales = dsWithYearCol.withColumn("logSales",
|
||||
udf((sales: Int) => math.log(sales)).apply(col("sales")))
|
||||
|
||||
// fill with mean values
|
||||
val meanCompetitionDistance = dsWithLogSales.select(avg("competitionDistance")).first()(0).
|
||||
asInstanceOf[Double]
|
||||
println("====" + meanCompetitionDistance)
|
||||
val finalDS = dsWithLogSales.withColumn("transformedCompetitionDistance",
|
||||
udf((distance: Int) => if (distance > 0) distance.toDouble else meanCompetitionDistance).
|
||||
apply(col("competitionDistance")))
|
||||
|
||||
val vectorAssembler = new VectorAssembler()
|
||||
.setInputCols(Array("storeId", "daysOfWeek", "promo", "competitionDistance", "promo2", "day",
|
||||
"month", "year", "transformedCompetitionDistance", "stateHolidayIndex",
|
||||
"schoolHolidayIndex", "storeTypeIndex", "assortmentIndex", "promoIntervalIndex"))
|
||||
.setOutputCol("features")
|
||||
|
||||
val pipeline = new Pipeline().setStages(
|
||||
Array(stateHolidayIndexer, schoolHolidayIndexer, storeTypeIndexer, assortmentIndexer,
|
||||
promoInterval, vectorAssembler))
|
||||
|
||||
pipeline.fit(finalDS).transform(finalDS).
|
||||
drop("stateHoliday", "schoolHoliday", "storeType", "assortment", "promoInterval", "sales",
|
||||
"promo2SinceWeek", "customers", "promoInterval", "competitionOpenSinceYear",
|
||||
"competitionOpenSinceMonth", "promo2SinceYear", "competitionDistance", "date")
|
||||
}
|
||||
|
||||
private def crossValidation(
|
||||
xgboostParam: Map[String, Any],
|
||||
trainingData: Dataset[_]): TrainValidationSplitModel = {
|
||||
val xgbEstimator = new XGBoostEstimator(xgboostParam).setFeaturesCol("features").
|
||||
setLabelCol("logSales")
|
||||
val paramGrid = new ParamGridBuilder()
|
||||
.addGrid(xgbEstimator.round, Array(20, 50))
|
||||
.addGrid(xgbEstimator.eta, Array(0.1, 0.4))
|
||||
.build()
|
||||
val tv = new TrainValidationSplit()
|
||||
.setEstimator(xgbEstimator)
|
||||
.setEvaluator(new RegressionEvaluator().setLabelCol("logSales"))
|
||||
.setEstimatorParamMaps(paramGrid)
|
||||
.setTrainRatio(0.8) // Use 3+ in practice
|
||||
tv.fit(trainingData)
|
||||
}
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val sparkSession = SparkSession.builder().appName("rosseman").getOrCreate()
|
||||
import sparkSession.implicits._
|
||||
|
||||
// parse training file to data frame
|
||||
val trainingPath = args(0)
|
||||
val allSalesRecords = parseTrainingFile(trainingPath)
|
||||
// create dataset
|
||||
val salesRecordsDF = allSalesRecords.toDF
|
||||
|
||||
// parse store file to data frame
|
||||
val storeFilePath = args(1)
|
||||
val allStores = parseStoreFile(storeFilePath)
|
||||
val storesDS = allStores.toDF()
|
||||
|
||||
val fullDataset = salesRecordsDF.join(storesDS, "storeId")
|
||||
val featureEngineeredDF = featureEngineering(fullDataset)
|
||||
// prediction
|
||||
val params = new mutable.HashMap[String, Any]()
|
||||
params += "eta" -> 0.1
|
||||
params += "max_depth" -> 6
|
||||
params += "silent" -> 1
|
||||
params += "ntreelimit" -> 1000
|
||||
params += "objective" -> "reg:linear"
|
||||
params += "subsample" -> 0.8
|
||||
params += "num_round" -> 100
|
||||
|
||||
val bestModel = crossValidation(params.toMap, featureEngineeredDF)
|
||||
}
|
||||
}
|
||||
@@ -1,78 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.example.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
|
||||
|
||||
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
|
||||
|
||||
// this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)
|
||||
object SparkTraining {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
if (args.length < 1) {
|
||||
// scalastyle:off
|
||||
println("Usage: program input_path")
|
||||
sys.exit(1)
|
||||
}
|
||||
|
||||
val spark = SparkSession.builder().getOrCreate()
|
||||
val inputPath = args(0)
|
||||
val schema = new StructType(Array(
|
||||
StructField("sepal length", DoubleType, true),
|
||||
StructField("sepal width", DoubleType, true),
|
||||
StructField("petal length", DoubleType, true),
|
||||
StructField("petal width", DoubleType, true),
|
||||
StructField("class", StringType, true)))
|
||||
val rawInput = spark.read.schema(schema).csv(args(0))
|
||||
|
||||
// transform class to index to make xgboost happy
|
||||
val stringIndexer = new StringIndexer()
|
||||
.setInputCol("class")
|
||||
.setOutputCol("classIndex")
|
||||
.fit(rawInput)
|
||||
val labelTransformed = stringIndexer.transform(rawInput).drop("class")
|
||||
// compose all feature columns as vector
|
||||
val vectorAssembler = new VectorAssembler().
|
||||
setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")).
|
||||
setOutputCol("features")
|
||||
val xgbInput = vectorAssembler.transform(labelTransformed).select("features",
|
||||
"classIndex")
|
||||
|
||||
/**
|
||||
* setup "timeout_request_workers" -> 60000L to make this application if it cannot get enough resources
|
||||
* to get 2 workers within 60000 ms
|
||||
*
|
||||
* setup "checkpoint_path" -> "/checkpoints" and "checkpoint_interval" -> 2 to save checkpoint for every
|
||||
* two iterations
|
||||
*/
|
||||
val xgbParam = Map("eta" -> 0.1f,
|
||||
"max_depth" -> 2,
|
||||
"objective" -> "multi:softprob",
|
||||
"num_class" -> 3,
|
||||
"num_round" -> 100,
|
||||
"num_workers" -> 2)
|
||||
val xgbClassifier = new XGBoostClassifier(xgbParam).
|
||||
setFeaturesCol("features").
|
||||
setLabelCol("classIndex")
|
||||
val xgbClassificationModel = xgbClassifier.fit(xgbInput)
|
||||
val results = xgbClassificationModel.transform(xgbInput)
|
||||
results.show()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.example.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.Booster
|
||||
import ml.dmlc.xgboost4j.scala.spark.XGBoost
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.SparkConf
|
||||
|
||||
object SparkWithDataFrame {
|
||||
def main(args: Array[String]): Unit = {
|
||||
if (args.length != 4) {
|
||||
println(
|
||||
"usage: program num_of_rounds num_workers training_path test_path")
|
||||
sys.exit(1)
|
||||
}
|
||||
// create SparkSession
|
||||
val sparkConf = new SparkConf().setAppName("XGBoost-spark-example")
|
||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||
sparkConf.registerKryoClasses(Array(classOf[Booster]))
|
||||
// val sqlContext = new SQLContext(new SparkContext(sparkConf))
|
||||
val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
|
||||
// create training and testing dataframes
|
||||
val numRound = args(0).toInt
|
||||
val inputTrainPath = args(2)
|
||||
val inputTestPath = args(3)
|
||||
// build dataset
|
||||
val trainDF = sparkSession.sqlContext.read.format("libsvm").load(inputTrainPath)
|
||||
val testDF = sparkSession.sqlContext.read.format("libsvm").load(inputTestPath)
|
||||
// start training
|
||||
val paramMap = List(
|
||||
"eta" -> 0.1f,
|
||||
"max_depth" -> 2,
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgboostModel = XGBoost.trainWithDataFrame(
|
||||
trainDF, paramMap, numRound, nWorkers = args(1).toInt, useExternalMemory = true)
|
||||
// xgboost-spark appends the column containing prediction results
|
||||
xgboostModel.transform(testDF).show()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.example.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.Booster
|
||||
import ml.dmlc.xgboost4j.scala.spark.XGBoost
|
||||
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.{DenseVector => MLDenseVector}
|
||||
import org.apache.spark.mllib.util.MLUtils
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
|
||||
object SparkWithRDD {
|
||||
def main(args: Array[String]): Unit = {
|
||||
if (args.length != 5) {
|
||||
println(
|
||||
"usage: program num_of_rounds num_workers training_path test_path model_path")
|
||||
sys.exit(1)
|
||||
}
|
||||
val sparkConf = new SparkConf().setAppName("XGBoost-spark-example")
|
||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||
sparkConf.registerKryoClasses(Array(classOf[Booster]))
|
||||
implicit val sc = new SparkContext(sparkConf)
|
||||
val inputTrainPath = args(2)
|
||||
val inputTestPath = args(3)
|
||||
val outputModelPath = args(4)
|
||||
// number of iterations
|
||||
val numRound = args(0).toInt
|
||||
val trainRDD = MLUtils.loadLibSVMFile(sc, inputTrainPath).map(lp =>
|
||||
MLLabeledPoint(lp.label, new MLDenseVector(lp.features.toArray)))
|
||||
val testSet = MLUtils.loadLibSVMFile(sc, inputTestPath)
|
||||
.map(lp => new MLDenseVector(lp.features.toArray))
|
||||
// training parameters
|
||||
val paramMap = List(
|
||||
"eta" -> 0.1f,
|
||||
"max_depth" -> 2,
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgboostModel = XGBoost.trainWithRDD(trainRDD, paramMap, numRound, nWorkers = args(1).toInt,
|
||||
useExternalMemory = true)
|
||||
xgboostModel.predict(testSet, missingValue = Float.NaN)
|
||||
// save model to HDFS path
|
||||
xgboostModel.saveModelAsHadoopFile(outputModelPath)
|
||||
}
|
||||
}
|
||||
@@ -6,10 +6,10 @@
|
||||
<parent>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
</parent>
|
||||
<artifactId>xgboost4j-flink</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
@@ -26,7 +26,7 @@
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
<parent>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
</parent>
|
||||
<artifactId>xgboost4j-spark</artifactId>
|
||||
<build>
|
||||
@@ -24,7 +24,7 @@
|
||||
<dependency>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost4j</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.Booster
|
||||
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost}
|
||||
import org.apache.commons.logging.LogFactory
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.spark.SparkContext
|
||||
@@ -64,9 +63,9 @@ private[spark] class CheckpointManager(sc: SparkContext, checkpointPath: String)
|
||||
val version = versions.max
|
||||
val fullPath = getPath(version)
|
||||
logger.info(s"Start training from previous booster at $fullPath")
|
||||
val booster = SXGBoost.loadModel(fullPath)
|
||||
booster.booster.setVersion(version)
|
||||
booster
|
||||
val model = XGBoost.loadModelFromHadoopFile(fullPath)(sc)
|
||||
model.booster.booster.setVersion(version)
|
||||
model.booster
|
||||
} else {
|
||||
null
|
||||
}
|
||||
@@ -77,12 +76,12 @@ private[spark] class CheckpointManager(sc: SparkContext, checkpointPath: String)
|
||||
*
|
||||
* @param checkpoint the checkpoint to save as an XGBoostModel
|
||||
*/
|
||||
private[spark] def updateCheckpoint(checkpoint: Booster): Unit = {
|
||||
private[spark] def updateCheckpoint(checkpoint: XGBoostModel): Unit = {
|
||||
val fs = FileSystem.get(sc.hadoopConfiguration)
|
||||
val prevModelPaths = getExistingVersions.map(version => new Path(getPath(version)))
|
||||
val fullPath = getPath(checkpoint.getVersion)
|
||||
logger.info(s"Saving checkpoint model with version ${checkpoint.getVersion} to $fullPath")
|
||||
checkpoint.saveModel(fullPath)
|
||||
val fullPath = getPath(checkpoint.version)
|
||||
logger.info(s"Saving checkpoint model with version ${checkpoint.version} to $fullPath")
|
||||
checkpoint.saveModelAsHadoopFile(fullPath)(sc)
|
||||
prevModelPaths.foreach(path => fs.delete(path, true))
|
||||
}
|
||||
|
||||
|
||||
@@ -21,15 +21,16 @@ import java.nio.file.Files
|
||||
|
||||
import scala.collection.mutable
|
||||
import scala.util.Random
|
||||
|
||||
import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, RabitTracker => PyRabitTracker}
|
||||
import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
|
||||
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
|
||||
import org.apache.commons.io.FileUtils
|
||||
import org.apache.commons.logging.LogFactory
|
||||
import org.apache.hadoop.fs.{FSDataInputStream, Path}
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.Dataset
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.{SparkContext, SparkParallelismTracker, TaskContext}
|
||||
|
||||
|
||||
@@ -55,11 +56,11 @@ object TrackerConf {
|
||||
object XGBoost extends Serializable {
|
||||
private val logger = LogFactory.getLog("XGBoostSpark")
|
||||
|
||||
private[spark] def removeMissingValues(
|
||||
xgbLabelPoints: Iterator[XGBLabeledPoint],
|
||||
private def removeMissingValues(
|
||||
denseLabeledPoints: Iterator[XGBLabeledPoint],
|
||||
missing: Float): Iterator[XGBLabeledPoint] = {
|
||||
if (!missing.isNaN) {
|
||||
xgbLabelPoints.map { labeledPoint =>
|
||||
denseLabeledPoints.map { labeledPoint =>
|
||||
val indicesBuilder = new mutable.ArrayBuilder.ofInt()
|
||||
val valuesBuilder = new mutable.ArrayBuilder.ofFloat()
|
||||
for ((value, i) <- labeledPoint.values.zipWithIndex if value != missing) {
|
||||
@@ -69,7 +70,7 @@ object XGBoost extends Serializable {
|
||||
labeledPoint.copy(indices = indicesBuilder.result(), values = valuesBuilder.result())
|
||||
}
|
||||
} else {
|
||||
xgbLabelPoints
|
||||
denseLabeledPoints
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,7 +134,7 @@ object XGBoost extends Serializable {
|
||||
fromBaseMarginsToArray(baseMargins), cacheDirName)
|
||||
|
||||
try {
|
||||
val numEarlyStoppingRounds = params.get("num_early_stopping_rounds")
|
||||
val numEarlyStoppingRounds = params.get("numEarlyStoppingRounds")
|
||||
.map(_.toString.toInt).getOrElse(0)
|
||||
val metrics = Array.tabulate(watches.size)(_ => Array.ofDim[Float](round))
|
||||
val booster = SXGBoost.train(watches.train, params, round,
|
||||
@@ -147,6 +148,89 @@ object XGBoost extends Serializable {
|
||||
}.cache()
|
||||
}
|
||||
|
||||
/**
|
||||
* Train XGBoost model with the DataFrame-represented data
|
||||
*
|
||||
* @param trainingData the training set represented as DataFrame
|
||||
* @param params Map containing the parameters to configure XGBoost
|
||||
* @param round the number of iterations
|
||||
* @param nWorkers the number of xgboost workers, 0 by default which means that the number of
|
||||
* workers equals to the partition number of trainingData RDD
|
||||
* @param obj An instance of [[ObjectiveTrait]] specifying a custom objective, null by default
|
||||
* @param eval An instance of [[EvalTrait]] specifying a custom evaluation metric, null by default
|
||||
* @param useExternalMemory indicate whether to use external memory cache, by setting this flag as
|
||||
* true, the user may save the RAM cost for running XGBoost within Spark
|
||||
* @param missing The value which represents a missing value in the dataset
|
||||
* @param featureCol the name of input column, "features" as default value
|
||||
* @param labelCol the name of output column, "label" as default value
|
||||
* @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed
|
||||
* @return XGBoostModel when successful training
|
||||
*/
|
||||
@throws(classOf[XGBoostError])
|
||||
def trainWithDataFrame(
|
||||
trainingData: Dataset[_],
|
||||
params: Map[String, Any],
|
||||
round: Int,
|
||||
nWorkers: Int,
|
||||
obj: ObjectiveTrait = null,
|
||||
eval: EvalTrait = null,
|
||||
useExternalMemory: Boolean = false,
|
||||
missing: Float = Float.NaN,
|
||||
featureCol: String = "features",
|
||||
labelCol: String = "label"): XGBoostModel = {
|
||||
require(nWorkers > 0, "you must specify more than 0 workers")
|
||||
val estimator = new XGBoostEstimator(params)
|
||||
// assigning general parameters
|
||||
estimator.
|
||||
set(estimator.useExternalMemory, useExternalMemory).
|
||||
set(estimator.round, round).
|
||||
set(estimator.nWorkers, nWorkers).
|
||||
set(estimator.customObj, obj).
|
||||
set(estimator.customEval, eval).
|
||||
set(estimator.missing, missing).
|
||||
setFeaturesCol(featureCol).
|
||||
setLabelCol(labelCol).
|
||||
fit(trainingData)
|
||||
}
|
||||
|
||||
private[spark] def isClassificationTask(params: Map[String, Any]): Boolean = {
|
||||
val objective = params.getOrElse("objective", params.getOrElse("obj_type", null))
|
||||
objective != null && {
|
||||
val objStr = objective.toString
|
||||
objStr != "regression" && !objStr.startsWith("reg:") && objStr != "count:poisson" &&
|
||||
!objStr.startsWith("rank:")
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Train XGBoost model with the RDD-represented data
|
||||
*
|
||||
* @param trainingData the training set represented as RDD
|
||||
* @param params Map containing the configuration entries
|
||||
* @param round the number of iterations
|
||||
* @param nWorkers the number of xgboost workers, 0 by default which means that the number of
|
||||
* workers equals to the partition number of trainingData RDD
|
||||
* @param obj An instance of [[ObjectiveTrait]] specifying a custom objective, null by default
|
||||
* @param eval An instance of [[EvalTrait]] specifying a custom evaluation metric, null by default
|
||||
* @param useExternalMemory indicate whether to use external memory cache, by setting this flag as
|
||||
* true, the user may save the RAM cost for running XGBoost within Spark
|
||||
* @param missing the value represented the missing value in the dataset
|
||||
* @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed
|
||||
* @return XGBoostModel when successful training
|
||||
*/
|
||||
@deprecated("Use XGBoost.trainWithRDD instead.")
|
||||
def train(
|
||||
trainingData: RDD[MLLabeledPoint],
|
||||
params: Map[String, Any],
|
||||
round: Int,
|
||||
nWorkers: Int,
|
||||
obj: ObjectiveTrait = null,
|
||||
eval: EvalTrait = null,
|
||||
useExternalMemory: Boolean = false,
|
||||
missing: Float = Float.NaN): XGBoostModel = {
|
||||
trainWithRDD(trainingData, params, round, nWorkers, obj, eval, useExternalMemory, missing)
|
||||
}
|
||||
|
||||
private def overrideParamsAccordingToTaskCPUs(
|
||||
params: Map[String, Any],
|
||||
sc: SparkContext): Map[String, Any] = {
|
||||
@@ -175,8 +259,39 @@ object XGBoost extends Serializable {
|
||||
}
|
||||
|
||||
/**
|
||||
* @return A tuple of the booster and the metrics used to build training summary
|
||||
* Train XGBoost model with the RDD-represented data
|
||||
*
|
||||
* @param trainingData the training set represented as RDD
|
||||
* @param params Map containing the configuration entries
|
||||
* @param round the number of iterations
|
||||
* @param nWorkers the number of xgboost workers, 0 by default which means that the number of
|
||||
* workers equals to the partition number of trainingData RDD
|
||||
* @param obj An instance of [[ObjectiveTrait]] specifying a custom objective, null by default
|
||||
* @param eval An instance of [[EvalTrait]] specifying a custom evaluation metric, null by default
|
||||
* @param useExternalMemory indicate whether to use external memory cache, by setting this flag as
|
||||
* true, the user may save the RAM cost for running XGBoost within Spark
|
||||
* @param missing The value which represents a missing value in the dataset
|
||||
* @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training has failed
|
||||
* @return XGBoostModel when successful training
|
||||
*/
|
||||
@throws(classOf[XGBoostError])
|
||||
def trainWithRDD(
|
||||
trainingData: RDD[MLLabeledPoint],
|
||||
params: Map[String, Any],
|
||||
round: Int,
|
||||
nWorkers: Int,
|
||||
obj: ObjectiveTrait = null,
|
||||
eval: EvalTrait = null,
|
||||
useExternalMemory: Boolean = false,
|
||||
missing: Float = Float.NaN): XGBoostModel = {
|
||||
import DataUtils._
|
||||
val xgbTrainingData = trainingData.map { case MLLabeledPoint(label, features) =>
|
||||
features.asXGB.copy(label = label.toFloat)
|
||||
}
|
||||
trainDistributed(xgbTrainingData, params, round, nWorkers, obj, eval,
|
||||
useExternalMemory, missing)
|
||||
}
|
||||
|
||||
@throws(classOf[XGBoostError])
|
||||
private[spark] def trainDistributed(
|
||||
trainingData: RDD[XGBLabeledPoint],
|
||||
@@ -186,7 +301,7 @@ object XGBoost extends Serializable {
|
||||
obj: ObjectiveTrait = null,
|
||||
eval: EvalTrait = null,
|
||||
useExternalMemory: Boolean = false,
|
||||
missing: Float = Float.NaN): (Booster, Map[String, Array[Float]]) = {
|
||||
missing: Float = Float.NaN): XGBoostModel = {
|
||||
if (params.contains("tree_method")) {
|
||||
require(params("tree_method") != "hist", "xgboost4j-spark does not support fast histogram" +
|
||||
" for now")
|
||||
@@ -222,8 +337,8 @@ object XGBoost extends Serializable {
|
||||
checkpointRound: Int =>
|
||||
val tracker = startTracker(nWorkers, trackerConf)
|
||||
try {
|
||||
val overriddenParams = overrideParamsAccordingToTaskCPUs(params, sc)
|
||||
val parallelismTracker = new SparkParallelismTracker(sc, timeoutRequestWorkers, nWorkers)
|
||||
val overriddenParams = overrideParamsAccordingToTaskCPUs(params, sc)
|
||||
val boostersAndMetrics = buildDistributedBoosters(partitionedData, overriddenParams,
|
||||
tracker.getWorkerEnvs, checkpointRound, obj, eval, useExternalMemory, missing,
|
||||
prevBooster)
|
||||
@@ -235,15 +350,20 @@ object XGBoost extends Serializable {
|
||||
}
|
||||
sparkJobThread.setUncaughtExceptionHandler(tracker)
|
||||
sparkJobThread.start()
|
||||
val isClsTask = isClassificationTask(params)
|
||||
val trackerReturnVal = parallelismTracker.execute(tracker.waitFor(0L))
|
||||
logger.info(s"Rabit returns with exit code $trackerReturnVal")
|
||||
val (booster, metrics) = postTrackerReturnProcessing(trackerReturnVal, boostersAndMetrics,
|
||||
sparkJobThread)
|
||||
if (checkpointRound < round) {
|
||||
prevBooster = booster
|
||||
checkpointManager.updateCheckpoint(prevBooster)
|
||||
val model = postTrackerReturnProcessing(trackerReturnVal, boostersAndMetrics,
|
||||
sparkJobThread, isClsTask)
|
||||
if (isClsTask){
|
||||
model.asInstanceOf[XGBoostClassificationModel].numOfClasses =
|
||||
params.getOrElse("num_class", "2").toString.toInt
|
||||
}
|
||||
(booster, metrics)
|
||||
if (checkpointRound < round) {
|
||||
prevBooster = model.booster
|
||||
checkpointManager.updateCheckpoint(model)
|
||||
}
|
||||
model
|
||||
} finally {
|
||||
tracker.stop()
|
||||
}
|
||||
@@ -263,14 +383,17 @@ object XGBoost extends Serializable {
|
||||
private def postTrackerReturnProcessing(
|
||||
trackerReturnVal: Int,
|
||||
distributedBoostersAndMetrics: RDD[(Booster, Map[String, Array[Float]])],
|
||||
sparkJobThread: Thread): (Booster, Map[String, Array[Float]]) = {
|
||||
sparkJobThread: Thread,
|
||||
isClassificationTask: Boolean
|
||||
): XGBoostModel = {
|
||||
if (trackerReturnVal == 0) {
|
||||
// Copies of the final booster and the corresponding metrics
|
||||
// reside in each partition of the `distributedBoostersAndMetrics`.
|
||||
// Any of them can be used to create the model.
|
||||
val (booster, metrics) = distributedBoostersAndMetrics.first()
|
||||
val xgboostModel = XGBoostModel(booster, isClassificationTask)
|
||||
distributedBoostersAndMetrics.unpersist(false)
|
||||
(booster, metrics)
|
||||
xgboostModel.setSummary(XGBoostTrainingSummary(metrics))
|
||||
} else {
|
||||
try {
|
||||
if (sparkJobThread.isAlive) {
|
||||
@@ -284,6 +407,64 @@ object XGBoost extends Serializable {
|
||||
}
|
||||
}
|
||||
|
||||
private def loadGeneralModelParams(inputStream: FSDataInputStream): (String, String, String) = {
|
||||
val featureCol = inputStream.readUTF()
|
||||
val labelCol = inputStream.readUTF()
|
||||
val predictionCol = inputStream.readUTF()
|
||||
(featureCol, labelCol, predictionCol)
|
||||
}
|
||||
|
||||
private def setGeneralModelParams(
|
||||
featureCol: String,
|
||||
labelCol: String,
|
||||
predCol: String,
|
||||
xgBoostModel: XGBoostModel): XGBoostModel = {
|
||||
xgBoostModel.setFeaturesCol(featureCol)
|
||||
xgBoostModel.setLabelCol(labelCol)
|
||||
xgBoostModel.setPredictionCol(predCol)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Load XGBoost model from path in HDFS-compatible file system
|
||||
*
|
||||
* @param modelPath The path of the file representing the model
|
||||
* @return The loaded model
|
||||
*/
|
||||
def loadModelFromHadoopFile(modelPath: String)(implicit sparkContext: SparkContext):
|
||||
XGBoostModel = {
|
||||
val path = new Path(modelPath)
|
||||
val dataInStream = path.getFileSystem(sparkContext.hadoopConfiguration).open(path)
|
||||
val modelType = dataInStream.readUTF()
|
||||
val (featureCol, labelCol, predictionCol) = loadGeneralModelParams(dataInStream)
|
||||
modelType match {
|
||||
case "_cls_" =>
|
||||
val rawPredictionCol = dataInStream.readUTF()
|
||||
val numClasses = dataInStream.readInt()
|
||||
val thresholdLength = dataInStream.readInt()
|
||||
var thresholds: Array[Double] = null
|
||||
if (thresholdLength != -1) {
|
||||
thresholds = new Array[Double](thresholdLength)
|
||||
for (i <- 0 until thresholdLength) {
|
||||
thresholds(i) = dataInStream.readDouble()
|
||||
}
|
||||
}
|
||||
val xgBoostModel = new XGBoostClassificationModel(SXGBoost.loadModel(dataInStream))
|
||||
setGeneralModelParams(featureCol, labelCol, predictionCol, xgBoostModel).
|
||||
asInstanceOf[XGBoostClassificationModel].setRawPredictionCol(rawPredictionCol)
|
||||
if (thresholdLength != -1) {
|
||||
xgBoostModel.setThresholds(thresholds)
|
||||
}
|
||||
xgBoostModel.asInstanceOf[XGBoostClassificationModel].numOfClasses = numClasses
|
||||
xgBoostModel
|
||||
case "_reg_" =>
|
||||
val xgBoostModel = new XGBoostRegressionModel(SXGBoost.loadModel(dataInStream))
|
||||
setGeneralModelParams(featureCol, labelCol, predictionCol, xgBoostModel)
|
||||
case other =>
|
||||
throw new XGBoostError(s"Unknown model type $other. Supported types " +
|
||||
s"are: ['_reg_', '_cls_'].")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class Watches private(
|
||||
@@ -308,29 +489,12 @@ private class Watches private(
|
||||
|
||||
private object Watches {
|
||||
|
||||
def buildGroups(groups: Seq[Int]): Seq[Int] = {
|
||||
val output = mutable.ArrayBuffer.empty[Int]
|
||||
var count = 1
|
||||
var lastGroup = groups.head
|
||||
for (group <- groups.tail) {
|
||||
if (group != lastGroup) {
|
||||
lastGroup = group
|
||||
output += count
|
||||
count = 1
|
||||
} else {
|
||||
count += 1
|
||||
}
|
||||
}
|
||||
output += count
|
||||
output
|
||||
}
|
||||
|
||||
def apply(
|
||||
params: Map[String, Any],
|
||||
labeledPoints: Iterator[XGBLabeledPoint],
|
||||
baseMarginsOpt: Option[Array[Float]],
|
||||
cacheDirName: Option[String]): Watches = {
|
||||
val trainTestRatio = params.get("train_test_ratio").map(_.toString.toDouble).getOrElse(1.0)
|
||||
val trainTestRatio = params.get("trainTestRatio").map(_.toString.toDouble).getOrElse(1.0)
|
||||
val seed = params.get("seed").map(_.toString.toLong).getOrElse(System.nanoTime())
|
||||
val r = new Random(seed)
|
||||
val testPoints = mutable.ArrayBuffer.empty[XGBLabeledPoint]
|
||||
@@ -342,18 +506,8 @@ private object Watches {
|
||||
|
||||
accepted
|
||||
}
|
||||
|
||||
val (trainIter1, trainIter2) = trainPoints.duplicate
|
||||
val trainMatrix = new DMatrix(trainIter1, cacheDirName.map(_ + "/train").orNull)
|
||||
val trainGroups = buildGroups(trainIter2.map(_.group).toSeq).toArray
|
||||
trainMatrix.setGroup(trainGroups)
|
||||
|
||||
val trainMatrix = new DMatrix(trainPoints, cacheDirName.map(_ + "/train").orNull)
|
||||
val testMatrix = new DMatrix(testPoints.iterator, cacheDirName.map(_ + "/test").orNull)
|
||||
if (trainTestRatio < 1.0) {
|
||||
val testGroups = buildGroups(testPoints.map(_.group)).toArray
|
||||
testMatrix.setGroup(testGroups)
|
||||
}
|
||||
|
||||
r.setSeed(seed)
|
||||
for (baseMargins <- baseMarginsOpt) {
|
||||
val (trainMargin, testMargin) = baseMargins.partition(_ => r.nextDouble() <= trainTestRatio)
|
||||
@@ -361,6 +515,11 @@ private object Watches {
|
||||
testMatrix.setBaseMargin(testMargin)
|
||||
}
|
||||
|
||||
// TODO: use group attribute from the points.
|
||||
if (params.contains("groupData") && params("groupData") != null) {
|
||||
trainMatrix.setGroup(params("groupData").asInstanceOf[Seq[Seq[Int]]](
|
||||
TaskContext.getPartitionId()).toArray)
|
||||
}
|
||||
new Watches(trainMatrix, testMatrix, cacheDirName)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,181 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import scala.collection.mutable
|
||||
import ml.dmlc.xgboost4j.scala.Booster
|
||||
import org.apache.spark.ml.linalg.{DenseVector => MLDenseVector, Vector => MLVector}
|
||||
import org.apache.spark.ml.param.{BooleanParam, DoubleArrayParam, Param, ParamMap}
|
||||
import org.apache.spark.ml.util.Identifiable
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.types._
|
||||
import org.apache.spark.sql.{DataFrame, Dataset}
|
||||
|
||||
/**
|
||||
* class of the XGBoost model used for classification task
|
||||
*/
|
||||
class XGBoostClassificationModel private[spark](
|
||||
override val uid: String, booster: Booster)
|
||||
extends XGBoostModel(booster) {
|
||||
|
||||
def this(booster: Booster) = this(Identifiable.randomUID("XGBoostClassificationModel"), booster)
|
||||
|
||||
// only called in copy()
|
||||
def this(uid: String) = this(uid, null)
|
||||
|
||||
// scalastyle:off
|
||||
|
||||
/**
|
||||
* whether to output raw margin
|
||||
*/
|
||||
final val outputMargin = new BooleanParam(this, "outputMargin", "whether to output untransformed margin value")
|
||||
|
||||
setDefault(outputMargin, false)
|
||||
|
||||
def setOutputMargin(value: Boolean): XGBoostModel = set(outputMargin, value).asInstanceOf[XGBoostClassificationModel]
|
||||
|
||||
/**
|
||||
* the name of the column storing the raw prediction value, either probabilities (as default) or
|
||||
* raw margin value
|
||||
*/
|
||||
final val rawPredictionCol: Param[String] = new Param[String](this, "rawPredictionCol", "Column name for raw prediction output of xgboost. If outputMargin is true, the column contains untransformed margin value; otherwise it is the probability for each class (by default).")
|
||||
|
||||
setDefault(rawPredictionCol, "probabilities")
|
||||
|
||||
final def getRawPredictionCol: String = $(rawPredictionCol)
|
||||
|
||||
def setRawPredictionCol(value: String): XGBoostClassificationModel = set(rawPredictionCol, value).asInstanceOf[XGBoostClassificationModel]
|
||||
|
||||
/**
|
||||
* Thresholds in multi-class classification
|
||||
*/
|
||||
final val thresholds: DoubleArrayParam = new DoubleArrayParam(this, "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold", (t: Array[Double]) => t.forall(_ >= 0))
|
||||
|
||||
def getThresholds: Array[Double] = $(thresholds)
|
||||
|
||||
def setThresholds(value: Array[Double]): XGBoostClassificationModel =
|
||||
set(thresholds, value).asInstanceOf[XGBoostClassificationModel]
|
||||
|
||||
// scalastyle:on
|
||||
|
||||
// generate dataframe containing raw prediction column which is typed as Vector
|
||||
private def predictRaw(
|
||||
testSet: Dataset[_],
|
||||
temporalColName: Option[String] = None,
|
||||
forceTransformedScore: Option[Boolean] = None): DataFrame = {
|
||||
val predictRDD = produceRowRDD(testSet, forceTransformedScore.getOrElse($(outputMargin)))
|
||||
val colName = temporalColName.getOrElse($(rawPredictionCol))
|
||||
val tempColName = colName + "_arraytype"
|
||||
val dsWithArrayTypedRawPredCol = testSet.sparkSession.createDataFrame(predictRDD, schema = {
|
||||
testSet.schema.add(tempColName, ArrayType(FloatType, containsNull = false))
|
||||
})
|
||||
val transformerForProbabilitiesArray =
|
||||
(rawPredArray: mutable.WrappedArray[Float]) =>
|
||||
if (numClasses == 2) {
|
||||
Array(1 - rawPredArray(0), rawPredArray(0)).map(_.toDouble)
|
||||
} else {
|
||||
rawPredArray.map(_.toDouble).array
|
||||
}
|
||||
dsWithArrayTypedRawPredCol.withColumn(colName,
|
||||
udf((rawPredArray: mutable.WrappedArray[Float]) =>
|
||||
new MLDenseVector(transformerForProbabilitiesArray(rawPredArray))).apply(col(tempColName))).
|
||||
drop(tempColName)
|
||||
}
|
||||
|
||||
private def fromFeatureToPrediction(testSet: Dataset[_]): Dataset[_] = {
|
||||
val rawPredictionDF = predictRaw(testSet, Some("rawPredictionCol"))
|
||||
val predictionUDF = udf(raw2prediction _).apply(col("rawPredictionCol"))
|
||||
val tempDF = rawPredictionDF.withColumn($(predictionCol), predictionUDF)
|
||||
val allColumnNames = testSet.columns ++ Seq($(predictionCol))
|
||||
tempDF.select(allColumnNames(0), allColumnNames.tail: _*)
|
||||
}
|
||||
|
||||
private def argMax(vector: Array[Double]): Double = {
|
||||
vector.zipWithIndex.maxBy(_._1)._2
|
||||
}
|
||||
|
||||
private def raw2prediction(rawPrediction: MLDenseVector): Double = {
|
||||
if (!isDefined(thresholds)) {
|
||||
argMax(rawPrediction.values)
|
||||
} else {
|
||||
probability2prediction(rawPrediction)
|
||||
}
|
||||
}
|
||||
|
||||
private def probability2prediction(probability: MLDenseVector): Double = {
|
||||
if (!isDefined(thresholds)) {
|
||||
argMax(probability.values)
|
||||
} else {
|
||||
val thresholds: Array[Double] = getThresholds
|
||||
val scaledProbability =
|
||||
probability.values.zip(thresholds).map { case (p, t) =>
|
||||
if (t == 0.0) Double.PositiveInfinity else p / t
|
||||
}
|
||||
argMax(scaledProbability)
|
||||
}
|
||||
}
|
||||
|
||||
override protected def transformImpl(testSet: Dataset[_]): DataFrame = {
|
||||
transformSchema(testSet.schema, logging = true)
|
||||
if (isDefined(thresholds)) {
|
||||
require($(thresholds).length == numClasses, this.getClass.getSimpleName +
|
||||
".transform() called with non-matching numClasses and thresholds.length." +
|
||||
s" numClasses=$numClasses, but thresholds has length ${$(thresholds).length}")
|
||||
}
|
||||
if ($(outputMargin)) {
|
||||
setRawPredictionCol("margin")
|
||||
}
|
||||
var outputData = testSet
|
||||
var numColsOutput = 0
|
||||
if ($(rawPredictionCol).nonEmpty) {
|
||||
outputData = predictRaw(testSet)
|
||||
numColsOutput += 1
|
||||
}
|
||||
|
||||
if ($(predictionCol).nonEmpty) {
|
||||
if ($(rawPredictionCol).nonEmpty) {
|
||||
require(!$(outputMargin), "XGBoost does not support output final prediction with" +
|
||||
" untransformed margin. Please set predictionCol as \"\" when setting outputMargin as" +
|
||||
" true")
|
||||
val rawToPredUDF = udf(raw2prediction _).apply(col($(rawPredictionCol)))
|
||||
outputData = outputData.withColumn($(predictionCol), rawToPredUDF)
|
||||
} else {
|
||||
outputData = fromFeatureToPrediction(testSet)
|
||||
}
|
||||
numColsOutput += 1
|
||||
}
|
||||
|
||||
if (numColsOutput == 0) {
|
||||
this.logWarning(s"$uid: XGBoostClassificationModel.transform() was called as NOOP" +
|
||||
" since no output columns were set.")
|
||||
}
|
||||
outputData.toDF()
|
||||
}
|
||||
|
||||
private[spark] var numOfClasses = 2
|
||||
|
||||
def numClasses: Int = numOfClasses
|
||||
|
||||
override def copy(extra: ParamMap): XGBoostClassificationModel = {
|
||||
val newModel = copyValues(new XGBoostClassificationModel(booster), extra)
|
||||
newModel.setSummary(summary)
|
||||
}
|
||||
|
||||
override protected def predict(features: MLVector): Double = {
|
||||
throw new Exception("XGBoost does not support online prediction ")
|
||||
}
|
||||
}
|
||||
@@ -1,518 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import scala.collection.Iterator
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable
|
||||
|
||||
import ml.dmlc.xgboost4j.java.Rabit
|
||||
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost => SXGBoost}
|
||||
import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait}
|
||||
import ml.dmlc.xgboost4j.scala.spark.params._
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
import org.apache.hadoop.fs.Path
|
||||
|
||||
import org.apache.spark.TaskContext
|
||||
import org.apache.spark.ml.classification._
|
||||
import org.apache.spark.ml.linalg._
|
||||
import org.apache.spark.ml.param._
|
||||
import org.apache.spark.ml.param.shared.HasWeightCol
|
||||
import org.apache.spark.ml.util._
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.types._
|
||||
import org.apache.spark.sql._
|
||||
import org.json4s.DefaultFormats
|
||||
|
||||
import org.apache.spark.broadcast.Broadcast
|
||||
|
||||
// Aggregates every Param mixin needed by the classifier and its model:
// general/learning-task/booster tuning params, optional instance weight and
// base-margin columns, the class count, the MLlib<->XGBoost param-map bridge,
// and the optional leaf/contribution prediction output columns.
private[spark] trait XGBoostClassifierParams extends GeneralParams with LearningTaskParams
  with BoosterParams with HasWeightCol with HasBaseMarginCol with HasNumClass with ParamMapFuncs
  with HasLeafPredictionCol with HasContribPredictionCol
||||
/**
 * XGBoost estimator for classification, integrated with the Spark ML Pipeline API.
 *
 * @param uid unique identifier of this estimator instance
 * @param xgboostParams raw XGBoost-style parameter map; mirrored into MLlib Params
 *                      via `XGBoostToMLlibParams` at construction time
 */
class XGBoostClassifier (
    override val uid: String,
    private val xgboostParams: Map[String, Any])
  extends ProbabilisticClassifier[Vector, XGBoostClassifier, XGBoostClassificationModel]
    with XGBoostClassifierParams with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("xgbc"), Map[String, Any]())

  def this(uid: String) = this(uid, Map[String, Any]())

  def this(xgboostParams: Map[String, Any]) = this(
    Identifiable.randomUID("xgbc"), xgboostParams)

  // Copy the raw xgboost parameter map into the corresponding MLlib Param objects
  // so that getters/setters and ML persistence see consistent values.
  XGBoostToMLlibParams(xgboostParams)

  def setWeightCol(value: String): this.type = set(weightCol, value)

  def setBaseMarginCol(value: String): this.type = set(baseMarginCol, value)

  def setNumClass(value: Int): this.type = set(numClass, value)

  // setters for general params
  def setNumRound(value: Int): this.type = set(numRound, value)

  def setNumWorkers(value: Int): this.type = set(numWorkers, value)

  def setNthread(value: Int): this.type = set(nthread, value)

  def setUseExternalMemory(value: Boolean): this.type = set(useExternalMemory, value)

  def setSilent(value: Int): this.type = set(silent, value)

  def setMissing(value: Float): this.type = set(missing, value)

  def setTimeoutRequestWorkers(value: Long): this.type = set(timeoutRequestWorkers, value)

  def setCheckpointPath(value: String): this.type = set(checkpointPath, value)

  def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value)

  def setSeed(value: Long): this.type = set(seed, value)

  // setters for booster params
  def setEta(value: Double): this.type = set(eta, value)

  def setGamma(value: Double): this.type = set(gamma, value)

  def setMaxDepth(value: Int): this.type = set(maxDepth, value)

  def setMinChildWeight(value: Double): this.type = set(minChildWeight, value)

  def setMaxDeltaStep(value: Double): this.type = set(maxDeltaStep, value)

  def setSubsample(value: Double): this.type = set(subsample, value)

  def setColsampleBytree(value: Double): this.type = set(colsampleBytree, value)

  def setColsampleBylevel(value: Double): this.type = set(colsampleBylevel, value)

  def setLambda(value: Double): this.type = set(lambda, value)

  def setAlpha(value: Double): this.type = set(alpha, value)

  def setTreeMethod(value: String): this.type = set(treeMethod, value)

  def setGrowPolicy(value: String): this.type = set(growPolicy, value)

  def setMaxBins(value: Int): this.type = set(maxBins, value)

  def setSketchEps(value: Double): this.type = set(sketchEps, value)

  def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value)

  def setSampleType(value: String): this.type = set(sampleType, value)

  def setNormalizeType(value: String): this.type = set(normalizeType, value)

  def setRateDrop(value: Double): this.type = set(rateDrop, value)

  def setSkipDrop(value: Double): this.type = set(skipDrop, value)

  def setLambdaBias(value: Double): this.type = set(lambdaBias, value)

  // setters for learning params
  def setObjective(value: String): this.type = set(objective, value)

  def setBaseScore(value: Double): this.type = set(baseScore, value)

  def setEvalMetric(value: String): this.type = set(evalMetric, value)

  def setTrainTestRatio(value: Double): this.type = set(trainTestRatio, value)

  def setNumEarlyStoppingRounds(value: Int): this.type = set(numEarlyStoppingRounds, value)

  def setCustomObj(value: ObjectiveTrait): this.type = set(customObj, value)

  def setCustomEval(value: EvalTrait): this.type = set(customEval, value)

  // called at the start of fit/train when 'eval_metric' is not defined.
  // Picks "merror" for multiclass objectives and "error" for binary ones.
  private def setupDefaultEvalMetric(): String = {
    require(isDefined(objective), "Users must set \'objective\' via xgboostParams.")
    if ($(objective).startsWith("multi")) {
      // multi
      "merror"
    } else {
      // binary
      "error"
    }
  }

  /**
   * Trains a distributed XGBoost model on the given dataset.
   *
   * Validates that any user-supplied `num_class` matches the label column,
   * converts the dataset into an RDD of XGBLabeledPoint (with optional weight
   * and base margin), and delegates to XGBoost.trainDistributed.
   */
  override protected def train(dataset: Dataset[_]): XGBoostClassificationModel = {

    if (!isDefined(evalMetric) || $(evalMetric).isEmpty) {
      set(evalMetric, setupDefaultEvalMetric())
    }

    // Class count inferred from label metadata / data; must agree with num_class.
    val _numClasses = getNumClasses(dataset)
    if (isDefined(numClass) && $(numClass) != _numClasses) {
      throw new Exception("The number of classes in dataset doesn't match " +
        "\'num_class\' in xgboost params.")
    }

    // Default to weight 1.0 and no base margin (NaN sentinel) when unset.
    val weight = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
    val baseMargin = if (!isDefined(baseMarginCol) || $(baseMarginCol).isEmpty) {
      lit(Float.NaN)
    } else {
      col($(baseMarginCol))
    }

    // Convert each row into the XGBoost-native labeled-point representation;
    // sparse vectors keep their indices, dense vectors pass null indices.
    val instances: RDD[XGBLabeledPoint] = dataset.select(
      col($(featuresCol)),
      col($(labelCol)).cast(FloatType),
      baseMargin.cast(FloatType),
      weight.cast(FloatType)
    ).rdd.map { case Row(features: Vector, label: Float, baseMargin: Float, weight: Float) =>
      val (indices, values) = features match {
        case v: SparseVector => (v.indices, v.values.map(_.toFloat))
        case v: DenseVector => (null, v.values.map(_.toFloat))
      }
      XGBLabeledPoint(label, indices, values, baseMargin = baseMargin, weight = weight)
    }
    transformSchema(dataset.schema, logging = true)
    val derivedXGBParamMap = MLlib2XGBoostParams
    // All non-null param maps in XGBoostClassifier are in derivedXGBParamMap.
    val (_booster, _metrics) = XGBoost.trainDistributed(instances, derivedXGBParamMap,
      $(numRound), $(numWorkers), $(customObj), $(customEval), $(useExternalMemory),
      $(missing))
    val model = new XGBoostClassificationModel(uid, _numClasses, _booster)
    val summary = XGBoostTrainingSummary(_metrics)
    model.setSummary(summary)
    model
  }

  override def copy(extra: ParamMap): XGBoostClassifier = defaultCopy(extra)
}
|
||||
// Companion object providing ML persistence (DefaultParamsReadable) for the estimator.
object XGBoostClassifier extends DefaultParamsReadable[XGBoostClassifier] {

  // Override only to narrow the static return type for Java/Scala callers.
  override def load(path: String): XGBoostClassifier = super.load(path)
}
||||
|
||||
/**
 * Classification model produced by [[XGBoostClassifier]].
 *
 * Wraps a native XGBoost Booster and exposes Spark ML's probabilistic
 * classification contract (raw prediction, probability, prediction columns).
 *
 * @param uid unique identifier of this model instance
 * @param numClasses number of target classes
 * @param _booster the trained native booster
 */
class XGBoostClassificationModel private[ml](
    override val uid: String,
    override val numClasses: Int,
    private[spark] val _booster: Booster)
  extends ProbabilisticClassificationModel[Vector, XGBoostClassificationModel]
    with XGBoostClassifierParams with MLWritable with Serializable {

  import XGBoostClassificationModel._

  // only called in copy()
  def this(uid: String) = this(uid, 2, null)

  /**
   * Get the native booster instance of this model.
   * This is used to call low-level APIs on native booster, such as "getFeatureScore".
   */
  def nativeBooster: Booster = _booster

  // Populated by setSummary() after training; None for models restored from disk.
  private var trainingSummary: Option[XGBoostTrainingSummary] = None

  /**
   * Returns summary (e.g. train/test objective history) of model on the
   * training set. An exception is thrown if no summary is available.
   */
  def summary: XGBoostTrainingSummary = trainingSummary.getOrElse {
    throw new IllegalStateException("No training summary available for this XGBoostModel")
  }

  private[spark] def setSummary(summary: XGBoostTrainingSummary): this.type = {
    trainingSummary = Some(summary)
    this
  }

  def setLeafPredictionCol(value: String): this.type = set(leafPredictionCol, value)

  def setContribPredictionCol(value: String): this.type = set(contribPredictionCol, value)

  def setTreeLimit(value: Int): this.type = set(treeLimit, value)

  /**
   * Single instance prediction.
   * Note: The performance is not ideal, use it carefully!
   * Builds a one-row DMatrix per call; binary tasks threshold the single
   * probability at 0.5 (via rounding), multiclass returns the argmax class.
   */
  override def predict(features: Vector): Double = {
    import DataUtils._
    // NOTE(review): this DMatrix is never delete()d — presumably freed by the
    // native finalizer, but looks like a per-call native-memory leak; verify.
    val dm = new DMatrix(XGBoost.removeMissingValues(Iterator(features.asXGB), $(missing)))
    val probability = _booster.predict(data = dm)(0)
    if (numClasses == 2) {
      math.round(probability(0))
    } else {
      Vectors.dense(probability.map(_.toDouble)).argmax
    }
  }

  // Actually we don't use this function at all, to make it pass compiler check.
  override protected def predictRaw(features: Vector): Vector = {
    throw new Exception("XGBoost-Spark does not support \'predictRaw\'")
  }

  // Actually we don't use this function at all, to make it pass compiler check.
  override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
    throw new Exception("XGBoost-Spark does not support \'raw2probabilityInPlace\'")
  }

  // Generate raw prediction and probability prediction.
  // Runs batch prediction per partition against the broadcast booster and appends
  // the results (plus optional leaf/contrib columns) to the original rows.
  private def transformInternal(dataset: Dataset[_]): DataFrame = {

    // Append internal float-array columns for raw prediction and probability;
    // they are converted to Vector columns (and dropped) later in transform().
    val schema = StructType(dataset.schema.fields ++
      Seq(StructField(name = _rawPredictionCol, dataType =
        ArrayType(FloatType, containsNull = false), nullable = false)) ++
      Seq(StructField(name = _probabilityCol, dataType =
        ArrayType(FloatType, containsNull = false), nullable = false)))

    val bBooster = dataset.sparkSession.sparkContext.broadcast(_booster)
    val appName = dataset.sparkSession.sparkContext.appName

    val rdd = dataset.asInstanceOf[Dataset[Row]].rdd.mapPartitions { rowIterator =>
      if (rowIterator.hasNext) {
        // Rabit is only initialized for non-empty partitions.
        val rabitEnv = Array("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString).toMap
        Rabit.init(rabitEnv.asJava)
        // Duplicate the iterator: one pass feeds the DMatrix, the other is
        // zipped back with the prediction results.
        val (rowItr1, rowItr2) = rowIterator.duplicate
        val featuresIterator = rowItr2.map(row => row.getAs[Vector](
          $(featuresCol))).toList.iterator
        import DataUtils._
        // External-memory mode needs a per-partition on-disk cache file name.
        val cacheInfo = {
          if ($(useExternalMemory)) {
            s"$appName-${TaskContext.get().stageId()}-dtest_cache-${TaskContext.getPartitionId()}"
          } else {
            null
          }
        }
        val dm = new DMatrix(
          XGBoost.removeMissingValues(featuresIterator.map(_.asXGB), $(missing)),
          cacheInfo)
        try {
          // All prediction arrays are materialized here, so deleting the
          // DMatrix in `finally` before the iterators are consumed is safe.
          val Array(rawPredictionItr, probabilityItr, predLeafItr, predContribItr) =
            producePredictionItrs(bBooster, dm)
          Rabit.shutdown()
          produceResultIterator(rowItr1, rawPredictionItr, probabilityItr, predLeafItr,
            predContribItr)
        } finally {
          dm.delete()
        }
      } else {
        Iterator[Row]()
      }
    }

    bBooster.unpersist(blocking = false)

    dataset.sparkSession.createDataFrame(rdd, generateResultSchema(schema))
  }

  // Zip original rows with the requested prediction iterators. One branch per
  // combination of (leaf column set, contrib column set).
  private def produceResultIterator(
      originalRowItr: Iterator[Row],
      rawPredictionItr: Iterator[Row],
      probabilityItr: Iterator[Row],
      predLeafItr: Iterator[Row],
      predContribItr: Iterator[Row]): Iterator[Row] = {
    // the following implementation is to be improved
    if (isDefined(leafPredictionCol) && $(leafPredictionCol).nonEmpty &&
      isDefined(contribPredictionCol) && $(contribPredictionCol).nonEmpty) {
      originalRowItr.zip(rawPredictionItr).zip(probabilityItr).zip(predLeafItr).zip(predContribItr).
        map { case ((((originals: Row, rawPrediction: Row), probability: Row), leaves: Row),
          contribs: Row) =>
          Row.fromSeq(originals.toSeq ++ rawPrediction.toSeq ++ probability.toSeq ++ leaves.toSeq ++
            contribs.toSeq)
        }
    } else if (isDefined(leafPredictionCol) && $(leafPredictionCol).nonEmpty &&
      (!isDefined(contribPredictionCol) || $(contribPredictionCol).isEmpty)) {
      originalRowItr.zip(rawPredictionItr).zip(probabilityItr).zip(predLeafItr).
        map { case (((originals: Row, rawPrediction: Row), probability: Row), leaves: Row) =>
          Row.fromSeq(originals.toSeq ++ rawPrediction.toSeq ++ probability.toSeq ++ leaves.toSeq)
        }
    } else if ((!isDefined(leafPredictionCol) || $(leafPredictionCol).isEmpty) &&
      isDefined(contribPredictionCol) && $(contribPredictionCol).nonEmpty) {
      originalRowItr.zip(rawPredictionItr).zip(probabilityItr).zip(predContribItr).
        map { case (((originals: Row, rawPrediction: Row), probability: Row), contribs: Row) =>
          Row.fromSeq(originals.toSeq ++ rawPrediction.toSeq ++ probability.toSeq ++ contribs.toSeq)
        }
    } else {
      originalRowItr.zip(rawPredictionItr).zip(probabilityItr).map {
        case ((originals: Row, rawPrediction: Row), probability: Row) =>
          Row.fromSeq(originals.toSeq ++ rawPrediction.toSeq ++ probability.toSeq)
      }
    }
  }

  // Extend the fixed schema with optional leaf/contrib float-array columns.
  private def generateResultSchema(fixedSchema: StructType): StructType = {
    var resultSchema = fixedSchema
    if (isDefined(leafPredictionCol)) {
      resultSchema = resultSchema.add(StructField(name = $(leafPredictionCol), dataType =
        ArrayType(FloatType, containsNull = false), nullable = false))
    }
    if (isDefined(contribPredictionCol)) {
      resultSchema = resultSchema.add(StructField(name = $(contribPredictionCol), dataType =
        ArrayType(FloatType, containsNull = false), nullable = false))
    }
    resultSchema
  }

  // Run the four batch predictions (raw margin, probability, optional leaves,
  // optional contributions) against one DMatrix; empty iterators for unused outputs.
  private def producePredictionItrs(broadcastBooster: Broadcast[Booster], dm: DMatrix):
      Array[Iterator[Row]] = {
    val rawPredictionItr = {
      broadcastBooster.value.predict(dm, outPutMargin = true, $(treeLimit)).
        map(Row(_)).iterator
    }
    val probabilityItr = {
      broadcastBooster.value.predict(dm, outPutMargin = false, $(treeLimit)).
        map(Row(_)).iterator
    }
    val predLeafItr = {
      if (isDefined(leafPredictionCol)) {
        broadcastBooster.value.predictLeaf(dm, $(treeLimit)).map(Row(_)).iterator
      } else {
        Iterator()
      }
    }
    val predContribItr = {
      if (isDefined(contribPredictionCol)) {
        broadcastBooster.value.predictContrib(dm, $(treeLimit)).map(Row(_)).iterator
      } else {
        Iterator()
      }
    }
    Array(rawPredictionItr, probabilityItr, predLeafItr, predContribItr)
  }

  /**
   * Transform a dataset: run batch prediction once, then derive the requested
   * output columns (raw prediction / probability / prediction) from the internal
   * float-array columns, which are dropped at the end.
   */
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    if (isDefined(thresholds)) {
      require($(thresholds).length == numClasses, this.getClass.getSimpleName +
        ".transform() called with non-matching numClasses and thresholds.length." +
        s" numClasses=$numClasses, but thresholds has length ${$(thresholds).length}")
    }

    // Output selected columns only.
    // This is a bit complicated since it tries to avoid repeated computation.
    var outputData = transformInternal(dataset)
    var numColsOutput = 0

    val rawPredictionUDF = udf { rawPrediction: mutable.WrappedArray[Float] =>
      Vectors.dense(rawPrediction.map(_.toDouble).toArray)
    }

    // For binary tasks the booster emits a single positive-class probability;
    // expand it to a [P(0), P(1)] vector for the MLlib contract.
    val probabilityUDF = udf { probability: mutable.WrappedArray[Float] =>
      if (numClasses == 2) {
        Vectors.dense(Array(1 - probability(0), probability(0)).map(_.toDouble))
      } else {
        Vectors.dense(probability.map(_.toDouble).toArray)
      }
    }

    val predictUDF = udf { probability: mutable.WrappedArray[Float] =>
      // From XGBoost probability to MLlib prediction
      val probabilities = if (numClasses == 2) {
        Array(1 - probability(0), probability(0)).map(_.toDouble)
      } else {
        probability.map(_.toDouble).toArray
      }
      // probability2prediction honors user-set thresholds when defined.
      probability2prediction(Vectors.dense(probabilities))
    }

    if ($(rawPredictionCol).nonEmpty) {
      outputData = outputData
        .withColumn(getRawPredictionCol, rawPredictionUDF(col(_rawPredictionCol)))
      numColsOutput += 1
    }

    if ($(probabilityCol).nonEmpty) {
      outputData = outputData
        .withColumn(getProbabilityCol, probabilityUDF(col(_probabilityCol)))
      numColsOutput += 1
    }

    if ($(predictionCol).nonEmpty) {
      outputData = outputData
        .withColumn($(predictionCol), predictUDF(col(_probabilityCol)))
      numColsOutput += 1
    }

    if (numColsOutput == 0) {
      this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() was called as NOOP" +
        " since no output columns were set.")
    }
    // Drop the internal float-array columns before returning.
    outputData
      .toDF
      .drop(col(_rawPredictionCol))
      .drop(col(_probabilityCol))
  }

  override def copy(extra: ParamMap): XGBoostClassificationModel = {
    val newModel = copyValues(new XGBoostClassificationModel(uid, numClasses, _booster), extra)
    // NOTE(review): `summary` throws when no training summary exists (e.g. a model
    // loaded via read/load) — copy() would then fail for loaded models; verify.
    newModel.setSummary(summary).setParent(parent)
  }

  override def write: MLWriter =
    new XGBoostClassificationModel.XGBoostClassificationModelWriter(this)
}
|
||||
// Companion object: ML persistence (reader/writer) for XGBoostClassificationModel,
// plus the names of the internal float-array columns used during transform().
object XGBoostClassificationModel extends MLReadable[XGBoostClassificationModel] {

  // Internal (temporary) column names appended by transformInternal and dropped
  // again at the end of transform().
  private val _rawPredictionCol = "_rawPrediction"
  private val _probabilityCol = "_probability"

  override def read: MLReader[XGBoostClassificationModel] = new XGBoostClassificationModelReader

  override def load(path: String): XGBoostClassificationModel = super.load(path)

  private[XGBoostClassificationModel]
  class XGBoostClassificationModelWriter(instance: XGBoostClassificationModel) extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      // Save metadata and Params
      implicit val format = DefaultFormats
      implicit val sc = super.sparkSession.sparkContext

      DefaultXGBoostParamsWriter.saveMetadata(instance, path, sc)
      // Save model data: the class count followed by the native booster bytes,
      // written to <path>/data/XGBoostClassificationModel.
      val dataPath = new Path(path, "data").toString
      val internalPath = new Path(dataPath, "XGBoostClassificationModel")
      val outputStream = internalPath.getFileSystem(sc.hadoopConfiguration).create(internalPath)
      outputStream.writeInt(instance.numClasses)
      instance._booster.saveModel(outputStream)
      outputStream.close()
    }
  }

  private class XGBoostClassificationModelReader extends MLReader[XGBoostClassificationModel] {

    /** Checked against metadata when loading model */
    private val className = classOf[XGBoostClassificationModel].getName

    override def load(path: String): XGBoostClassificationModel = {
      implicit val sc = super.sparkSession.sparkContext


      val metadata = DefaultXGBoostParamsReader.loadMetadata(path, sc, className)

      // Read back in the same order the writer produced: class count, then booster.
      val dataPath = new Path(path, "data").toString
      val internalPath = new Path(dataPath, "XGBoostClassificationModel")
      val dataInStream = internalPath.getFileSystem(sc.hadoopConfiguration).open(internalPath)
      val numClasses = dataInStream.readInt()

      val booster = SXGBoost.loadModel(dataInStream)
      val model = new XGBoostClassificationModel(metadata.uid, numClasses, booster)
      DefaultXGBoostParamsReader.getAndSetParams(model, metadata)
      // NOTE(review): the input stream is not closed here and no training summary
      // is set on the loaded model — confirm both against the writer/copy paths.
      model
    }
  }
}
||||
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import scala.collection.mutable
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.spark.params._
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
|
||||
import org.apache.spark.ml.Predictor
|
||||
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
|
||||
import org.apache.spark.ml.param._
|
||||
import org.apache.spark.ml.util._
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.types.FloatType
|
||||
import org.apache.spark.sql.{Dataset, Row}
|
||||
import org.json4s.DefaultFormats
|
||||
|
||||
/**
|
||||
* XGBoost Estimator to produce a XGBoost model
|
||||
*/
|
||||
/**
 * XGBoost Estimator to produce a XGBoost model
 *
 * @param uid unique identifier of this estimator instance
 * @param xgboostParams raw XGBoost-style parameter map, mirrored into MLlib
 *                      Params by fromXGBParamMapToParams() at construction time
 */
class XGBoostEstimator private[spark](
    override val uid: String, xgboostParams: Map[String, Any])
  extends Predictor[Vector, XGBoostEstimator, XGBoostModel]
    with LearningTaskParams with GeneralParams with BoosterParams with MLWritable {

  def this(xgboostParams: Map[String, Any]) =
    this(Identifiable.randomUID("XGBoostEstimator"), xgboostParams: Map[String, Any])

  def this(uid: String) = this(uid, Map[String, Any]())

  // called in fromXGBParamMapToParams only when eval_metric is not defined.
  // Default metric depends on the objective: rmse (regression / none),
  // map (ranking), merror (multiclass), error (binary).
  private def setupDefaultEvalMetric(): String = {
    val objFunc = xgboostParams.getOrElse("objective", xgboostParams.getOrElse("obj_type", null))
    if (objFunc == null) {
      "rmse"
    } else {
      // compute default metric based on specified objective
      val isClassificationTask = XGBoost.isClassificationTask(xgboostParams)
      if (!isClassificationTask) {
        // default metric for regression or ranking
        if (objFunc.toString.startsWith("rank")) {
          "map"
        } else {
          "rmse"
        }
      } else {
        // default metric for classification
        if (objFunc.toString.startsWith("multi")) {
          // multi
          "merror"
        } else {
          // binary
          "error"
        }
      }
    }
  }

  // Copy each entry of the raw xgboost parameter map into the matching MLlib
  // Param (by name), coercing the value to the Param's declared type.
  // Unknown keys are silently ignored.
  private def fromXGBParamMapToParams(): Unit = {
    for ((paramName, paramValue) <- xgboostParams) {
      params.find(_.name == paramName) match {
        case None =>
        case Some(_: DoubleParam) =>
          set(paramName, paramValue.toString.toDouble)
        case Some(_: BooleanParam) =>
          set(paramName, paramValue.toString.toBoolean)
        case Some(_: IntParam) =>
          set(paramName, paramValue.toString.toInt)
        case Some(_: FloatParam) =>
          set(paramName, paramValue.toString.toFloat)
        case Some(_: Param[_]) =>
          set(paramName, paramValue)
      }
    }
    if (xgboostParams.get("eval_metric").isEmpty) {
      set("eval_metric", setupDefaultEvalMetric())
    }
  }

  // Mirror the constructor-supplied parameter map into MLlib Params eagerly.
  fromXGBParamMapToParams()

  // Inverse of fromXGBParamMapToParams: dump every Param back into a plain map.
  // "num_class" is removed for non-classification and binary tasks.
  private[spark] def fromParamsToXGBParamMap: Map[String, Any] = {
    val xgbParamMap = new mutable.HashMap[String, Any]()
    for (param <- params) {
      xgbParamMap += param.name -> $(param)
    }
    val r = xgbParamMap.toMap
    if (!XGBoost.isClassificationTask(r) || $(numClasses) == 2) {
      r - "num_class"
    } else {
      r
    }
  }

  // Add default base-margin (NaN) and weight (1.0) columns when the configured
  // column names are absent from the training set.
  // NOTE(review): assumes baseMarginCol/weightCol always have values (defaults
  // or user-set); $(...) would throw if either Param were undefined — verify.
  private def ensureColumns(trainingSet: Dataset[_]): Dataset[_] = {
    var newTrainingSet = trainingSet
    if (!trainingSet.columns.contains($(baseMarginCol))) {
      newTrainingSet = newTrainingSet.withColumn($(baseMarginCol), lit(Float.NaN))
    }
    if (!trainingSet.columns.contains($(weightCol))) {
      newTrainingSet = newTrainingSet.withColumn($(weightCol), lit(1.0))
    }
    newTrainingSet
  }

  /**
   * produce a XGBoostModel by fitting the given dataset
   */
  override def train(trainingSet: Dataset[_]): XGBoostModel = {
    // Convert rows into the XGBoost-native labeled-point representation;
    // sparse vectors keep their indices, dense vectors pass null indices.
    val instances = ensureColumns(trainingSet).select(
      col($(featuresCol)),
      col($(labelCol)).cast(FloatType),
      col($(baseMarginCol)).cast(FloatType),
      col($(weightCol)).cast(FloatType)
    ).rdd.map { case Row(features: Vector, label: Float, baseMargin: Float, weight: Float) =>
      val (indices, values) = features match {
        case v: SparseVector => (v.indices, v.values.map(_.toFloat))
        case v: DenseVector => (null, v.values.map(_.toFloat))
      }
      XGBLabeledPoint(label.toFloat, indices, values, baseMargin = baseMargin, weight = weight)
    }
    transformSchema(trainingSet.schema, logging = true)
    val derivedXGBoosterParamMap = fromParamsToXGBParamMap
    val trainedModel = XGBoost.trainDistributed(instances, derivedXGBoosterParamMap,
      $(round), $(nWorkers), $(customObj), $(customEval), $(useExternalMemory),
      $(missing)).setParent(this)
    val returnedModel = copyValues(trainedModel, extractParamMap())
    // Classification models also need the class count propagated.
    if (XGBoost.isClassificationTask(derivedXGBoosterParamMap)) {
      returnedModel.asInstanceOf[XGBoostClassificationModel].numOfClasses = $(numClasses)
    }
    returnedModel
  }

  override def copy(extra: ParamMap): XGBoostEstimator = {
    defaultCopy(extra).asInstanceOf[XGBoostEstimator]
  }

  override def write: MLWriter = new XGBoostEstimator.XGBoostEstimatorWriter(this)
}
|
||||
// Companion object: ML persistence (reader/writer) for XGBoostEstimator.
object XGBoostEstimator extends MLReadable[XGBoostEstimator] {

  override def read: MLReader[XGBoostEstimator] = new XGBoostEstimatorReader

  override def load(path: String): XGBoostEstimator = super.load(path)

  private[XGBoostEstimator] class XGBoostEstimatorWriter(instance: XGBoostEstimator)
      extends MLWriter {
    override protected def saveImpl(path: String): Unit = {
      // Custom objective/eval functions are arbitrary user objects and cannot
      // be serialized by the params writer, so refuse to persist them.
      // NOTE(review): Map#apply throws if "custom_eval"/"custom_obj" keys are
      // absent from fromParamsToXGBParamMap — presumably they always exist as
      // Params with null defaults; verify.
      require(instance.fromParamsToXGBParamMap("custom_eval") == null &&
        instance.fromParamsToXGBParamMap("custom_obj") == null,
        "we do not support persist XGBoostEstimator with customized evaluator and objective" +
          " function for now")
      implicit val format = DefaultFormats
      implicit val sc = super.sparkSession.sparkContext
      DefaultXGBoostParamsWriter.saveMetadata(instance, path, sc)
    }
  }

  private class XGBoostEstimatorReader extends MLReader[XGBoostEstimator] {

    override def load(path: String): XGBoostEstimator = {
      val metadata = DefaultXGBoostParamsReader.loadMetadata(path, sc)
      // Reconstruct the estimator reflectively via its (uid: String) constructor,
      // then restore all Param values from metadata.
      val cls = Utils.classForName(metadata.className)
      val instance =
        cls.getConstructor(classOf[String]).newInstance(metadata.uid).asInstanceOf[Params]
      DefaultXGBoostParamsReader.getAndSetParams(instance, metadata)
      instance.asInstanceOf[XGBoostEstimator]
    }
  }
}
||||
@@ -0,0 +1,387 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
import ml.dmlc.xgboost4j.java.Rabit
|
||||
import ml.dmlc.xgboost4j.scala.spark.params.{BoosterParams, DefaultXGBoostParamsWriter}
|
||||
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait}
|
||||
|
||||
import org.apache.hadoop.fs.{FSDataOutputStream, Path}
|
||||
|
||||
import org.apache.spark.ml.PredictionModel
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.{DenseVector => MLDenseVector, Vector => MLVector}
|
||||
import org.apache.spark.ml.param.{BooleanParam, ParamMap, Params}
|
||||
import org.apache.spark.ml.util._
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.types.{ArrayType, FloatType}
|
||||
import org.apache.spark.{SparkContext, TaskContext}
|
||||
import org.json4s.DefaultFormats
|
||||
|
||||
/**
|
||||
* the base class of [[XGBoostClassificationModel]] and [[XGBoostRegressionModel]]
|
||||
*/
|
||||
abstract class XGBoostModel(protected var _booster: Booster)
|
||||
extends PredictionModel[MLVector, XGBoostModel] with BoosterParams with Serializable
|
||||
with Params with MLWritable {
|
||||
|
||||
// Populated by setSummary() after training; None for models restored from disk.
private var trainingSummary: Option[XGBoostTrainingSummary] = None

/**
 * Returns summary (e.g. train/test objective history) of model on the
 * training set. An exception is thrown if no summary is available.
 */
def summary: XGBoostTrainingSummary = trainingSummary.getOrElse {
  throw new IllegalStateException("No training summary available for this XGBoostModel")
}

// Attach the training summary; returns this for chaining.
private[spark] def setSummary(summary: XGBoostTrainingSummary): this.type = {
  trainingSummary = Some(summary)
  this
}
|
||||
// Set the label column name used during evaluation/training.
def setLabelCol(name: String): XGBoostModel = set(labelCol, name)
||||
// scalastyle:off

// Whether prediction/evaluation should spill DMatrix data to an external
// (on-disk) cache instead of keeping it all in memory. Defaults to false.
final val useExternalMemory = new BooleanParam(this, "use_external_memory",
  "whether to use external memory for prediction")

setDefault(useExternalMemory, false)

def setExternalMemory(value: Boolean): XGBoostModel = set(useExternalMemory, value)

// scalastyle:on
|
||||
/**
 * Predict leaf instances with the given test set (represented as RDD)
 *
 * For every non-empty partition, the feature vectors are packed into a native
 * DMatrix and `predictLeaf` is run against the broadcast booster. Empty
 * partitions yield an empty iterator.
 *
 * @param testSet test set represented as RDD
 * @return per-instance arrays of leaf indices, one array per input vector
 */
def predictLeaves(testSet: RDD[MLVector]): RDD[Array[Float]] = {
  import DataUtils._
  val broadcastBooster = testSet.sparkContext.broadcast(_booster)
  testSet.mapPartitions { testSamples =>
    // Fix: only initialize Rabit for non-empty partitions. The previous code
    // called Rabit.init unconditionally but Rabit.shutdown only in the
    // non-empty branch, leaking an initialized Rabit worker on every empty
    // partition. This now mirrors the init-inside-hasNext pattern used by the
    // transform path.
    if (testSamples.nonEmpty) {
      val rabitEnv = Map("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString)
      Rabit.init(rabitEnv.asJava)
      val dMatrix = new DMatrix(testSamples.map(_.asXGB))
      try {
        // predictLeaf materializes its result array, so deleting the DMatrix
        // in `finally` before the iterator is consumed is safe.
        broadcastBooster.value.predictLeaf(dMatrix).iterator
      } finally {
        Rabit.shutdown()
        dMatrix.delete()
      }
    } else {
      Iterator()
    }
  }
}
||||
|
||||
/**
 * evaluate XGBoostModel with a RDD-wrapped dataset
 *
 * NOTE: you have to specify value of either eval or iter; when you specify both, this method
 * adopts the default eval metric of model
 *
 * @param evalDataset the dataset used for evaluation
 * @param evalName the name of evaluation
 * @param evalFunc the customized evaluation function, null by default to use the default metric
 *                 of model
 * @param iter the current iteration, -1 to be null to use customized evaluation functions
 * @param useExternalCache whether to build the evaluation DMatrix with an external-memory cache
 * @param groupData group data specify each group size for ranking task. Top level corresponds
 *                  to partition id, second level is the group sizes.
 * @return the average metric over all partitions
 */
def eval(evalDataset: RDD[MLLabeledPoint], evalName: String, evalFunc: EvalTrait = null,
    iter: Int = -1, useExternalCache: Boolean = false,
    groupData: Seq[Seq[Int]] = null): String = {
  require(evalFunc != null || iter != -1, "you have to specify the value of either eval or iter")
  val broadcastBooster = evalDataset.sparkContext.broadcast(_booster)
  // FIX: the useExternalCache argument used to be silently ignored (only the model-level
  // use_external_memory param was consulted). Honor it; default false keeps old behavior.
  val broadcastUseExternalCache =
    evalDataset.sparkContext.broadcast(useExternalCache || $(useExternalMemory))
  val appName = evalDataset.context.appName
  val allEvalMetrics = evalDataset.mapPartitions {
    labeledPointsPartition =>
      import DataUtils._
      if (labeledPointsPartition.hasNext) {
        val rabitEnv = Map("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString)
        Rabit.init(rabitEnv.asJava)
        val cacheFileName = {
          if (broadcastUseExternalCache.value) {
            s"$appName-${TaskContext.get().stageId()}-$evalName" +
              s"-deval_cache-${TaskContext.getPartitionId()}"
          } else {
            null
          }
        }
        val dMatrix = new DMatrix(labeledPointsPartition.map(_.asXGB), cacheFileName)
        try {
          if (groupData != null) {
            // ranking task: group sizes are indexed by partition id
            dMatrix.setGroup(groupData(TaskContext.getPartitionId()).toArray)
          }
          (evalFunc, iter) match {
            case (null, _) =>
              // no custom metric: use the model's built-in eval metric
              val predStr = broadcastBooster.value.evalSet(Array(dMatrix), Array(evalName), iter)
              val Array(evName, predNumeric) = predStr.split(":")
              Iterator(Some(evName, predNumeric.toFloat))
            case _ =>
              // custom metric: score raw predictions with the supplied EvalTrait
              val predictions = broadcastBooster.value.predict(dMatrix)
              Iterator(Some((evalName, evalFunc.eval(predictions, dMatrix))))
          }
        } finally {
          Rabit.shutdown()
          dMatrix.delete()
        }
      } else {
        Iterator(None)
      }
  }.filter(_.isDefined).collect()
  // FIX: guard against an entirely empty dataset, which previously threw an opaque
  // NoSuchElementException on `.head` (and would yield a NaN mean).
  require(allEvalMetrics.nonEmpty, "the evaluation dataset is empty")
  val evalPrefix = allEvalMetrics.map(_.get._1).head
  val evalMetricMean = allEvalMetrics.map(_.get._2).sum / allEvalMetrics.length
  s"$evalPrefix = $evalMetricMean"
}
|
||||
|
||||
/**
 * Predict result with the given test set (represented as RDD)
 *
 * @param testSet test set represented as RDD
 * @param missingValue the specified value to represent the missing value
 */
def predict(testSet: RDD[MLDenseVector], missingValue: Float): RDD[Array[Float]] = {
  val boosterHandle = testSet.sparkContext.broadcast(_booster)
  testSet.mapPartitions { partitionIter =>
    val vectors = partitionIter.toArray
    val rowCount = vectors.length
    if (rowCount == 0) {
      Iterator()
    } else {
      val colCount = vectors.head.size
      Rabit.init(Map("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString).asJava)
      // flatten the dense vectors into the row-major float array that DMatrix expects
      val flattened = new Array[Float](rowCount * colCount)
      var idx = 0
      while (idx < flattened.length) {
        flattened(idx) = vectors(idx / colCount).values(idx % colCount).toFloat
        idx += 1
      }
      val dMatrix = new DMatrix(flattened, rowCount, colCount, missingValue)
      try {
        boosterHandle.value.predict(dMatrix).iterator
      } finally {
        Rabit.shutdown()
        dMatrix.delete()
      }
    }
  }
}
|
||||
|
||||
/**
 * Predict result with the given test set (represented as RDD)
 *
 * @param testSet test set represented as RDD
 * @param useExternalCache whether to use external cache for the test set
 * @param outputMargin whether to output raw untransformed margin value
 */
def predict(
    testSet: RDD[MLVector],
    useExternalCache: Boolean = false,
    outputMargin: Boolean = false): RDD[Array[Float]] = {
  val broadcastBooster = testSet.sparkContext.broadcast(_booster)
  val appName = testSet.context.appName
  testSet.mapPartitions { testSamples =>
    if (testSamples.nonEmpty) {
      import DataUtils._
      val rabitEnv = Array("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString).toMap
      Rabit.init(rabitEnv.asJava)
      val cacheFileName = {
        if (useExternalCache) {
          s"$appName-${TaskContext.get().stageId()}-dtest_cache-${TaskContext.getPartitionId()}"
        } else {
          null
        }
      }
      val dMatrix = new DMatrix(testSamples.map(_.asXGB), cacheFileName)
      try {
        // FIX: outputMargin was accepted but never forwarded to the native predictor,
        // so requesting margin output silently behaved like a regular prediction.
        broadcastBooster.value.predict(dMatrix, outputMargin).iterator
      } finally {
        Rabit.shutdown()
        dMatrix.delete()
      }
    } else {
      Iterator()
    }
  }
}
|
||||
|
||||
// Subclass hook: implementations produce the transformed DataFrame for `transform`
// (classification vs. regression shape the prediction column differently).
protected def transformImpl(testSet: Dataset[_]): DataFrame
|
||||
|
||||
/**
 * append leaf index of each row as an additional column in the original dataset
 *
 * @return the original dataframe with an additional column containing prediction results
 */
def transformLeaf(testSet: Dataset[_]): DataFrame = {
  val leafRDD = produceRowRDD(testSet, predLeaf = true)
  // NOTE: mutates this model's prediction column name before transforming the schema
  setPredictionCol("predLeaf")
  transformSchema(testSet.schema, logging = true)
  val outputSchema = testSet.schema.add(
    $(predictionCol), ArrayType(FloatType, containsNull = false))
  testSet.sparkSession.createDataFrame(leafRDD, outputSchema)
}
|
||||
|
||||
// Builds rows of (original columns ++ one prediction column) for transform/transformLeaf.
// @param outputMargin whether to output raw untransformed margin values
// @param predLeaf when true, emit predicted leaf indices instead of predictions
protected def produceRowRDD(testSet: Dataset[_], outputMargin: Boolean = false,
    predLeaf: Boolean = false): RDD[Row] = {
  val broadcastBooster = testSet.sparkSession.sparkContext.broadcast(_booster)
  val appName = testSet.sparkSession.sparkContext.appName
  testSet.rdd.mapPartitions {
    rowIterator =>
      if (rowIterator.hasNext) {
        val rabitEnv = Array("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString).toMap
        Rabit.init(rabitEnv.asJava)
        // duplicate the row iterator: one pass feeds the DMatrix, the other is zipped
        // with the prediction results below
        val (rowItr1, rowItr2) = rowIterator.duplicate
        val vectorIterator = rowItr2.map(row => row.asInstanceOf[Row].getAs[MLVector](
          $(featuresCol))).toList.iterator
        import DataUtils._
        val cachePrefix = {
          if ($(useExternalMemory)) {
            s"$appName-${TaskContext.get().stageId()}-dtest_cache-${TaskContext.getPartitionId()}"
          } else {
            null
          }
        }
        val testDataset = new DMatrix(vectorIterator.map(_.asXGB), cachePrefix)
        try {
          val rawPredictResults = {
            if (!predLeaf) {
              broadcastBooster.value.predict(testDataset, outputMargin).map(Row(_)).iterator
            } else {
              broadcastBooster.value.predictLeaf(testDataset).map(Row(_)).iterator
            }
          }
          // concatenate original data partition and predictions
          rowItr1.zip(rawPredictResults).map {
            case (originalColumns: Row, predictColumn: Row) =>
              Row.fromSeq(originalColumns.toSeq ++ predictColumn.toSeq)
          }
        } finally {
          // FIX: Rabit.shutdown() previously ran only on the success path (inside try,
          // before returning); a failing prediction left the Rabit context initialized.
          Rabit.shutdown()
          testDataset.delete()
        }
      } else {
        Iterator[Row]()
      }
  }
}
|
||||
|
||||
/**
 * produces the prediction results and append as an additional column in the original dataset
 * NOTE: the prediction results is kept as the original format of xgboost
 *
 * @return the original dataframe with an additional column containing prediction results
 */
override def transform(testSet: Dataset[_]): DataFrame = {
  // delegate to the task-specific implementation (classification vs. regression)
  transformImpl(testSet)
}
|
||||
|
||||
// Writes the shared estimator params (features/label/prediction column names) in a
// fixed order; must stay in sync with the corresponding model loader.
private def saveGeneralModelParam(outputStream: FSDataOutputStream): Unit = {
  outputStream.writeUTF(getFeaturesCol)
  outputStream.writeUTF(getLabelCol)
  outputStream.writeUTF(getPredictionCol)
}
|
||||
|
||||
/**
 * Save the model as to HDFS-compatible file system.
 *
 * Layout: a model-type tag ("_cls_" / "_reg_"), the general params, type-specific
 * params (classification only), then the serialized native booster.
 *
 * @param modelPath The model path as in Hadoop path.
 */
def saveModelAsHadoopFile(modelPath: String)(implicit sc: SparkContext): Unit = {
  val path = new Path(modelPath)
  val outputStream = path.getFileSystem(sc.hadoopConfiguration).create(path)
  try {
    // output model type
    this match {
      case model: XGBoostClassificationModel =>
        outputStream.writeUTF("_cls_")
        saveGeneralModelParam(outputStream)
        outputStream.writeUTF(model.getRawPredictionCol)
        outputStream.writeInt(model.numClasses)
        // thresholds: length first (-1 when undefined), then the values
        if (!isDefined(model.thresholds)) {
          outputStream.writeInt(-1)
        } else {
          val thresholds = model.getThresholds
          outputStream.writeInt(thresholds.length)
          thresholds.foreach(outputStream.writeDouble)
        }
      case model: XGBoostRegressionModel =>
        outputStream.writeUTF("_reg_")
        // eventual prediction col
        saveGeneralModelParam(outputStream)
    }
    // booster payload
    _booster.saveModel(outputStream)
  } finally {
    // FIX: close the stream even when serialization fails; it was previously leaked
    // (and a half-written file left open) on any exception.
    outputStream.close()
  }
}
|
||||
|
||||
/** The underlying native booster of this model. */
def booster: Booster = _booster

// Version counter reported by the native booster (semantics per xgboost4j
// Booster.getVersion — presumably tracks completed update rounds; confirm upstream).
def version: Int = this.booster.booster.getVersion

override def copy(extra: ParamMap): XGBoostModel = defaultCopy(extra)

override def write: MLWriter = new XGBoostModel.XGBoostModelModelWriter(this)
|
||||
}
|
||||
|
||||
object XGBoostModel extends MLReadable[XGBoostModel] {

  /** Wraps a native booster in the task-appropriate model subclass. */
  private[spark] def apply(booster: Booster, isClassification: Boolean): XGBoostModel = {
    if (isClassification) {
      new XGBoostClassificationModel(booster)
    } else {
      new XGBoostRegressionModel(booster)
    }
  }

  override def read: MLReader[XGBoostModel] = new XGBoostModelModelReader

  override def load(path: String): XGBoostModel = super.load(path)

  private[XGBoostModel] class XGBoostModelModelWriter(instance: XGBoostModel) extends MLWriter {
    override protected def saveImpl(path: String): Unit = {
      implicit val format = DefaultFormats
      implicit val sc = super.sparkSession.sparkContext
      // persist ML metadata alongside the platform-independent booster payload
      DefaultXGBoostParamsWriter.saveMetadata(instance, path, sc)
      instance.saveModelAsHadoopFile(new Path(path, "data").toString)
    }
  }

  private class XGBoostModelModelReader extends MLReader[XGBoostModel] {

    override def load(path: String): XGBoostModel = {
      implicit val sc = super.sparkSession.sparkContext
      // metadata is not needed here: all state lives in the xgboost model file itself
      // (DefaultXGBoostParamsReader.loadMetadata intentionally unused)
      XGBoost.loadModelFromHadoopFile(new Path(path, "data").toString)
    }
  }
}
|
||||
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import scala.collection.mutable
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.Booster
|
||||
import org.apache.spark.ml.linalg.{Vector => MLVector}
|
||||
import org.apache.spark.ml.param.ParamMap
|
||||
import org.apache.spark.ml.util.Identifiable
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.types.{ArrayType, FloatType}
|
||||
|
||||
/**
 * class of XGBoost model used for regression task
 */
class XGBoostRegressionModel private[spark](override val uid: String, booster: Booster)
  extends XGBoostModel(booster) {

  def this(_booster: Booster) = this(Identifiable.randomUID("XGBoostRegressionModel"), _booster)

  // only called in copy()
  def this(uid: String) = this(uid, null)

  override protected def transformImpl(testSet: Dataset[_]): DataFrame = {
    transformSchema(testSet.schema, logging = true)
    val predictRDD = produceRowRDD(testSet)
    val tempPredColName = $(predictionCol) + "_temp"
    // predictions arrive as single-element float arrays; unwrap them into a scalar column
    val transformerForArrayTypedPredCol =
      udf((regressionResults: mutable.WrappedArray[Float]) => regressionResults(0))
    testSet.sparkSession.createDataFrame(predictRDD,
      schema = testSet.schema.add(tempPredColName, ArrayType(FloatType, containsNull = false))
    ).withColumn(
      $(predictionCol),
      transformerForArrayTypedPredCol.apply(col(tempPredColName))).drop(tempPredColName)
  }

  override protected def predict(features: MLVector): Double = {
    throw new Exception("XGBoost does not support online prediction for now")
  }

  override def copy(extra: ParamMap): XGBoostRegressionModel = {
    // FIX: preserve this model's uid in the copy (Spark ML `copy` contract); the
    // previous implementation minted a fresh random uid via the auxiliary constructor.
    val newModel = copyValues(new XGBoostRegressionModel(uid, booster), extra)
    newModel.setSummary(summary)
  }
}
|
||||
@@ -1,443 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import scala.collection.Iterator
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
import ml.dmlc.xgboost4j.java.Rabit
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
import ml.dmlc.xgboost4j.scala.spark.params.{DefaultXGBoostParamsReader, _}
|
||||
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost => SXGBoost}
|
||||
import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait}
|
||||
import org.apache.hadoop.fs.Path
|
||||
|
||||
import org.apache.spark.TaskContext
|
||||
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
|
||||
import org.apache.spark.ml.param.shared.HasWeightCol
|
||||
import org.apache.spark.ml.util._
|
||||
import org.apache.spark.ml._
|
||||
import org.apache.spark.ml.param._
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.types._
|
||||
import org.json4s.DefaultFormats
|
||||
import scala.collection.mutable
|
||||
|
||||
import org.apache.spark.broadcast.Broadcast
|
||||
|
||||
// Aggregates every param family shared by XGBoostRegressor and XGBoostRegressionModel:
// general/booster/learning-task params plus the regression-specific columns
// (base margin, instance weight, ranking group, leaf/contrib prediction outputs).
private[spark] trait XGBoostRegressorParams extends GeneralParams with BoosterParams
  with LearningTaskParams with HasBaseMarginCol with HasWeightCol with HasGroupCol
  with ParamMapFuncs with HasLeafPredictionCol with HasContribPredictionCol
|
||||
|
||||
/**
 * XGBoost estimator for regression tasks, exposing the native XGBoost parameters
 * through the Spark ML Params API. `fit` produces an XGBoostRegressionModel.
 *
 * @param uid unique identifier of this estimator
 * @param xgboostParams native xgboost parameters keyed by their xgboost names; they are
 *                      translated into ML params in the constructor body
 */
class XGBoostRegressor (
    override val uid: String,
    private val xgboostParams: Map[String, Any])
  extends Predictor[Vector, XGBoostRegressor, XGBoostRegressionModel]
    with XGBoostRegressorParams with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("xgbr"), Map[String, Any]())

  def this(uid: String) = this(uid, Map[String, Any]())

  def this(xgboostParams: Map[String, Any]) = this(
    Identifiable.randomUID("xgbr"), xgboostParams)

  // constructor side effect: mirror the raw xgboost param map into the ML Params above
  XGBoostToMLlibParams(xgboostParams)

  def setWeightCol(value: String): this.type = set(weightCol, value)

  def setBaseMarginCol(value: String): this.type = set(baseMarginCol, value)

  def setGroupCol(value: String): this.type = set(groupCol, value)

  // setters for general params
  def setNumRound(value: Int): this.type = set(numRound, value)

  def setNumWorkers(value: Int): this.type = set(numWorkers, value)

  def setNthread(value: Int): this.type = set(nthread, value)

  def setUseExternalMemory(value: Boolean): this.type = set(useExternalMemory, value)

  def setSilent(value: Int): this.type = set(silent, value)

  def setMissing(value: Float): this.type = set(missing, value)

  def setTimeoutRequestWorkers(value: Long): this.type = set(timeoutRequestWorkers, value)

  def setCheckpointPath(value: String): this.type = set(checkpointPath, value)

  def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value)

  def setSeed(value: Long): this.type = set(seed, value)

  // setters for booster params
  def setEta(value: Double): this.type = set(eta, value)

  def setGamma(value: Double): this.type = set(gamma, value)

  def setMaxDepth(value: Int): this.type = set(maxDepth, value)

  def setMinChildWeight(value: Double): this.type = set(minChildWeight, value)

  def setMaxDeltaStep(value: Double): this.type = set(maxDeltaStep, value)

  def setSubsample(value: Double): this.type = set(subsample, value)

  def setColsampleBytree(value: Double): this.type = set(colsampleBytree, value)

  def setColsampleBylevel(value: Double): this.type = set(colsampleBylevel, value)

  def setLambda(value: Double): this.type = set(lambda, value)

  def setAlpha(value: Double): this.type = set(alpha, value)

  def setTreeMethod(value: String): this.type = set(treeMethod, value)

  def setGrowPolicy(value: String): this.type = set(growPolicy, value)

  def setMaxBins(value: Int): this.type = set(maxBins, value)

  def setSketchEps(value: Double): this.type = set(sketchEps, value)

  def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value)

  def setSampleType(value: String): this.type = set(sampleType, value)

  def setNormalizeType(value: String): this.type = set(normalizeType, value)

  def setRateDrop(value: Double): this.type = set(rateDrop, value)

  def setSkipDrop(value: Double): this.type = set(skipDrop, value)

  def setLambdaBias(value: Double): this.type = set(lambdaBias, value)

  // setters for learning params
  def setObjective(value: String): this.type = set(objective, value)

  def setBaseScore(value: Double): this.type = set(baseScore, value)

  def setEvalMetric(value: String): this.type = set(evalMetric, value)

  def setTrainTestRatio(value: Double): this.type = set(trainTestRatio, value)

  def setNumEarlyStoppingRounds(value: Int): this.type = set(numEarlyStoppingRounds, value)

  def setCustomObj(value: ObjectiveTrait): this.type = set(customObj, value)

  def setCustomEval(value: EvalTrait): this.type = set(customEval, value)

  // called at the start of fit/train when 'eval_metric' is not defined
  private def setupDefaultEvalMetric(): String = {
    require(isDefined(objective), "Users must set \'objective\' via xgboostParams.")
    if ($(objective).startsWith("rank")) {
      "map"
    } else {
      "rmse"
    }
  }

  /**
   * Fits the booster: converts the input Dataset into XGBLabeledPoints (label, optional
   * weight / ranking group / base margin) and runs distributed training.
   */
  override protected def train(dataset: Dataset[_]): XGBoostRegressionModel = {

    if (!isDefined(evalMetric) || $(evalMetric).isEmpty) {
      set(evalMetric, setupDefaultEvalMetric())
    }

    // optional columns fall back to neutral defaults when unset
    val weight = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
    val baseMargin = if (!isDefined(baseMarginCol) || $(baseMarginCol).isEmpty) {
      lit(Float.NaN)
    } else {
      col($(baseMarginCol))
    }
    val group = if (!isDefined(groupCol) || $(groupCol).isEmpty) lit(-1) else col($(groupCol))

    val instances: RDD[XGBLabeledPoint] = dataset.select(
      col($(labelCol)).cast(FloatType),
      col($(featuresCol)),
      weight.cast(FloatType),
      group.cast(IntegerType),
      baseMargin.cast(FloatType)
    ).rdd.map {
      case Row(label: Float, features: Vector, weight: Float, group: Int, baseMargin: Float) =>
        // sparse vectors keep their index array; dense vectors pass null indices
        val (indices, values) = features match {
          case v: SparseVector => (v.indices, v.values.map(_.toFloat))
          case v: DenseVector => (null, v.values.map(_.toFloat))
        }
        XGBLabeledPoint(label, indices, values, weight, group, baseMargin)
    }
    transformSchema(dataset.schema, logging = true)
    val derivedXGBParamMap = MLlib2XGBoostParams
    // All non-null param maps in XGBoostRegressor are in derivedXGBParamMap.
    val (_booster, _metrics) = XGBoost.trainDistributed(instances, derivedXGBParamMap,
      $(numRound), $(numWorkers), $(customObj), $(customEval), $(useExternalMemory),
      $(missing))
    val model = new XGBoostRegressionModel(uid, _booster)
    val summary = XGBoostTrainingSummary(_metrics)
    model.setSummary(summary)
    model
  }

  override def copy(extra: ParamMap): XGBoostRegressor = defaultCopy(extra)
}
|
||||
|
||||
/** Companion providing `load` for estimators persisted via DefaultParamsWritable. */
object XGBoostRegressor extends DefaultParamsReadable[XGBoostRegressor] {

  override def load(path: String): XGBoostRegressor = super.load(path)
}
|
||||
|
||||
/**
 * Model produced by [[XGBoostRegressor]]. Wraps the native booster and adds optional
 * leaf-index and feature-contribution output columns.
 */
class XGBoostRegressionModel private[ml] (
    override val uid: String,
    private[spark] val _booster: Booster)
  extends PredictionModel[Vector, XGBoostRegressionModel]
    with XGBoostRegressorParams with MLWritable with Serializable {

  import XGBoostRegressionModel._

  // only called in copy()
  def this(uid: String) = this(uid, null)

  /**
   * Get the native booster instance of this model.
   * This is used to call low-level APIs on native booster, such as "getFeatureScore".
   */
  def nativeBooster: Booster = _booster

  // populated by the estimator after training; None for loaded/copied-unfitted models
  private var trainingSummary: Option[XGBoostTrainingSummary] = None

  /**
   * Returns summary (e.g. train/test objective history) of model on the
   * training set. An exception is thrown if no summary is available.
   */
  def summary: XGBoostTrainingSummary = trainingSummary.getOrElse {
    throw new IllegalStateException("No training summary available for this XGBoostModel")
  }

  private[spark] def setSummary(summary: XGBoostTrainingSummary): this.type = {
    trainingSummary = Some(summary)
    this
  }

  def setLeafPredictionCol(value: String): this.type = set(leafPredictionCol, value)

  def setContribPredictionCol(value: String): this.type = set(contribPredictionCol, value)

  def setTreeLimit(value: Int): this.type = set(treeLimit, value)

  /**
   * Single instance prediction.
   * Note: The performance is not ideal, use it carefully!
   */
  override def predict(features: Vector): Double = {
    import DataUtils._
    val dm = new DMatrix(XGBoost.removeMissingValues(Iterator(features.asXGB), $(missing)))
    try {
      _booster.predict(data = dm)(0)(0)
    } finally {
      // FIX: release the native DMatrix; it was previously leaked on every call
      dm.delete()
    }
  }

  // Runs batch prediction per partition and appends the raw prediction (plus optional
  // leaf/contrib) columns to each row.
  private def transformInternal(dataset: Dataset[_]): DataFrame = {

    val schema = StructType(dataset.schema.fields ++
      Seq(StructField(name = _originalPredictionCol, dataType =
        ArrayType(FloatType, containsNull = false), nullable = false)))

    val bBooster = dataset.sparkSession.sparkContext.broadcast(_booster)
    val appName = dataset.sparkSession.sparkContext.appName

    val rdd = dataset.asInstanceOf[Dataset[Row]].rdd.mapPartitions { rowIterator =>
      if (rowIterator.hasNext) {
        val rabitEnv = Array("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString).toMap
        Rabit.init(rabitEnv.asJava)
        // one pass feeds the DMatrix, the duplicate is zipped with the results
        val (rowItr1, rowItr2) = rowIterator.duplicate
        val featuresIterator = rowItr2.map(row => row.getAs[Vector](
          $(featuresCol))).toList.iterator
        import DataUtils._
        val cacheInfo = {
          if ($(useExternalMemory)) {
            s"$appName-${TaskContext.get().stageId()}-dtest_cache-${TaskContext.getPartitionId()}"
          } else {
            null
          }
        }

        val dm = new DMatrix(
          XGBoost.removeMissingValues(featuresIterator.map(_.asXGB), $(missing)),
          cacheInfo)
        try {
          val Array(originalPredictionItr, predLeafItr, predContribItr) =
            producePredictionItrs(bBooster, dm)
          produceResultIterator(rowItr1, originalPredictionItr, predLeafItr, predContribItr)
        } finally {
          // FIX: shut Rabit down on all paths; it previously ran only on the success path
          Rabit.shutdown()
          dm.delete()
        }
      } else {
        Iterator[Row]()
      }
    }
    bBooster.unpersist(blocking = false)
    dataset.sparkSession.createDataFrame(rdd, generateResultSchema(schema))
  }

  // Zips the original rows with whichever prediction iterators are enabled, matching
  // the column order produced by generateResultSchema.
  private def produceResultIterator(
      originalRowItr: Iterator[Row],
      predictionItr: Iterator[Row],
      predLeafItr: Iterator[Row],
      predContribItr: Iterator[Row]): Iterator[Row] = {
    // the following implementation is to be improved
    if (isDefined(leafPredictionCol) && $(leafPredictionCol).nonEmpty &&
      isDefined(contribPredictionCol) && $(contribPredictionCol).nonEmpty) {
      originalRowItr.zip(predictionItr).zip(predLeafItr).zip(predContribItr).
        map { case (((originals: Row, prediction: Row), leaves: Row), contribs: Row) =>
          Row.fromSeq(originals.toSeq ++ prediction.toSeq ++ leaves.toSeq ++ contribs.toSeq)
        }
    } else if (isDefined(leafPredictionCol) && $(leafPredictionCol).nonEmpty &&
      (!isDefined(contribPredictionCol) || $(contribPredictionCol).isEmpty)) {
      originalRowItr.zip(predictionItr).zip(predLeafItr).
        map { case ((originals: Row, prediction: Row), leaves: Row) =>
          Row.fromSeq(originals.toSeq ++ prediction.toSeq ++ leaves.toSeq)
        }
    } else if ((!isDefined(leafPredictionCol) || $(leafPredictionCol).isEmpty) &&
      isDefined(contribPredictionCol) && $(contribPredictionCol).nonEmpty) {
      originalRowItr.zip(predictionItr).zip(predContribItr).
        map { case ((originals: Row, prediction: Row), contribs: Row) =>
          Row.fromSeq(originals.toSeq ++ prediction.toSeq ++ contribs.toSeq)
        }
    } else {
      originalRowItr.zip(predictionItr).map {
        case (originals: Row, originalPrediction: Row) =>
          Row.fromSeq(originals.toSeq ++ originalPrediction.toSeq)
      }
    }
  }

  // Extends the fixed schema with the optional leaf/contrib columns, in that order.
  private def generateResultSchema(fixedSchema: StructType): StructType = {
    var resultSchema = fixedSchema
    if (isDefined(leafPredictionCol)) {
      resultSchema = resultSchema.add(StructField(name = $(leafPredictionCol), dataType =
        ArrayType(FloatType, containsNull = false), nullable = false))
    }
    if (isDefined(contribPredictionCol)) {
      resultSchema = resultSchema.add(StructField(name = $(contribPredictionCol), dataType =
        ArrayType(FloatType, containsNull = false), nullable = false))
    }
    resultSchema
  }

  // Materializes the (prediction, leaf, contrib) iterators for one partition's DMatrix;
  // disabled outputs yield empty iterators.
  private def producePredictionItrs(broadcastBooster: Broadcast[Booster], dm: DMatrix):
      Array[Iterator[Row]] = {
    val originalPredictionItr = {
      // NOTE: "outPutMargin" is the (misspelled) upstream xgboost4j parameter name
      broadcastBooster.value.predict(dm, outPutMargin = false, $(treeLimit)).map(Row(_)).iterator
    }
    val predLeafItr = {
      if (isDefined(leafPredictionCol)) {
        broadcastBooster.value.predictLeaf(dm, $(treeLimit)).
          map(Row(_)).iterator
      } else {
        Iterator()
      }
    }
    val predContribItr = {
      if (isDefined(contribPredictionCol)) {
        broadcastBooster.value.predictContrib(dm, $(treeLimit)).
          map(Row(_)).iterator
      } else {
        Iterator()
      }
    }
    Array(originalPredictionItr, predLeafItr, predContribItr)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    // Output selected columns only.
    // This is a bit complicated since it tries to avoid repeated computation.
    var outputData = transformInternal(dataset)
    var numColsOutput = 0

    // raw predictions are single-element float arrays; unwrap to a Double scalar
    val predictUDF = udf { (originalPrediction: mutable.WrappedArray[Float]) =>
      originalPrediction(0).toDouble
    }

    if ($(predictionCol).nonEmpty) {
      outputData = outputData
        .withColumn($(predictionCol), predictUDF(col(_originalPredictionCol)))
      numColsOutput += 1
    }

    if (numColsOutput == 0) {
      this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() was called as NOOP" +
        " since no output columns were set.")
    }
    outputData.toDF.drop(col(_originalPredictionCol))
  }

  override def copy(extra: ParamMap): XGBoostRegressionModel = {
    val newModel = copyValues(new XGBoostRegressionModel(uid, _booster), extra)
    // FIX: only propagate a summary if one exists; calling `summary` unconditionally
    // made copy() throw IllegalStateException for loaded/unfitted models.
    trainingSummary.foreach(newModel.setSummary)
    newModel.setParent(parent)
  }

  override def write: MLWriter =
    new XGBoostRegressionModel.XGBoostRegressionModelWriter(this)
}
|
||||
|
||||
object XGBoostRegressionModel extends MLReadable[XGBoostRegressionModel] {

  // internal column holding the raw array-typed prediction before unwrapping
  private val _originalPredictionCol = "_originalPrediction"

  override def read: MLReader[XGBoostRegressionModel] = new XGBoostRegressionModelReader

  override def load(path: String): XGBoostRegressionModel = super.load(path)

  private[XGBoostRegressionModel]
  class XGBoostRegressionModelWriter(instance: XGBoostRegressionModel) extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      // Save metadata and Params
      implicit val format = DefaultFormats
      implicit val sc = super.sparkSession.sparkContext
      DefaultXGBoostParamsWriter.saveMetadata(instance, path, sc)
      // Save model data
      val dataPath = new Path(path, "data").toString
      val internalPath = new Path(dataPath, "XGBoostRegressionModel")
      val outputStream = internalPath.getFileSystem(sc.hadoopConfiguration).create(internalPath)
      try {
        instance._booster.saveModel(outputStream)
      } finally {
        // FIX: close the stream even if saving throws
        outputStream.close()
      }
    }
  }

  private class XGBoostRegressionModelReader extends MLReader[XGBoostRegressionModel] {

    /** Checked against metadata when loading model */
    private val className = classOf[XGBoostRegressionModel].getName

    override def load(path: String): XGBoostRegressionModel = {
      implicit val sc = super.sparkSession.sparkContext

      val metadata = DefaultXGBoostParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString
      val internalPath = new Path(dataPath, "XGBoostRegressionModel")
      val dataInStream = internalPath.getFileSystem(sc.hadoopConfiguration).open(internalPath)
      // FIX: the input stream was previously never closed, leaking one file handle
      // per loaded model
      val booster =
        try {
          SXGBoost.loadModel(dataInStream)
        } finally {
          dataInStream.close()
        }
      val model = new XGBoostRegressionModel(metadata.uid, booster)
      DefaultXGBoostParamsReader.getAndSetParams(model, metadata)
      model
    }
  }
}
|
||||
@@ -20,39 +20,40 @@ import scala.collection.immutable.HashSet
|
||||
|
||||
import org.apache.spark.ml.param.{DoubleParam, IntParam, Param, Params}
|
||||
|
||||
private[spark] trait BoosterParams extends Params {
|
||||
trait BoosterParams extends Params {
|
||||
|
||||
/**
|
||||
* Booster to use, options: {'gbtree', 'gblinear', 'dart'}
|
||||
*/
|
||||
val boosterType = new Param[String](this, "booster",
|
||||
s"Booster to use, options: {'gbtree', 'gblinear', 'dart'}",
|
||||
(value: String) => BoosterParams.supportedBoosters.contains(value.toLowerCase))
|
||||
|
||||
/**
|
||||
* step size shrinkage used in update to prevents overfitting. After each boosting step, we
|
||||
* can directly get the weights of new features and eta actually shrinks the feature weights
|
||||
* to make the boosting process more conservative. [default=0.3] range: [0,1]
|
||||
*/
|
||||
final val eta = new DoubleParam(this, "eta", "step size shrinkage used in update to prevents" +
|
||||
val eta = new DoubleParam(this, "eta", "step size shrinkage used in update to prevents" +
|
||||
" overfitting. After each boosting step, we can directly get the weights of new features." +
|
||||
" and eta actually shrinks the feature weights to make the boosting process more conservative.",
|
||||
(value: Double) => value >= 0 && value <= 1)
|
||||
|
||||
final def getEta: Double = $(eta)
|
||||
|
||||
/**
|
||||
* minimum loss reduction required to make a further partition on a leaf node of the tree.
|
||||
* the larger, the more conservative the algorithm will be. [default=0] range: [0,
|
||||
* Double.MaxValue]
|
||||
*/
|
||||
final val gamma = new DoubleParam(this, "gamma", "minimum loss reduction required to make a " +
|
||||
"further partition on a leaf node of the tree. the larger, the more conservative the " +
|
||||
"algorithm will be.", (value: Double) => value >= 0)
|
||||
|
||||
final def getGamma: Double = $(gamma)
|
||||
val gamma = new DoubleParam(this, "gamma", "minimum loss reduction required to make a further" +
|
||||
" partition on a leaf node of the tree. the larger, the more conservative the algorithm" +
|
||||
" will be.", (value: Double) => value >= 0)
|
||||
|
||||
/**
|
||||
* maximum depth of a tree, increase this value will make model more complex / likely to be
|
||||
* overfitting. [default=6] range: [1, Int.MaxValue]
|
||||
*/
|
||||
final val maxDepth = new IntParam(this, "maxDepth", "maximum depth of a tree, increase this " +
|
||||
"value will make model more complex/likely to be overfitting.", (value: Int) => value >= 1)
|
||||
|
||||
final def getMaxDepth: Int = $(maxDepth)
|
||||
val maxDepth = new IntParam(this, "max_depth", "maximum depth of a tree, increase this value" +
|
||||
" will make model more complex/likely to be overfitting.", (value: Int) => value >= 1)
|
||||
|
||||
/**
|
||||
* minimum sum of instance weight(hessian) needed in a child. If the tree partition step results
|
||||
@@ -61,15 +62,13 @@ private[spark] trait BoosterParams extends Params {
|
||||
* to minimum number of instances needed to be in each node. The larger, the more conservative
|
||||
* the algorithm will be. [default=1] range: [0, Double.MaxValue]
|
||||
*/
|
||||
final val minChildWeight = new DoubleParam(this, "minChildWeight", "minimum sum of instance" +
|
||||
val minChildWeight = new DoubleParam(this, "min_child_weight", "minimum sum of instance" +
|
||||
" weight(hessian) needed in a child. If the tree partition step results in a leaf node with" +
|
||||
" the sum of instance weight less than min_child_weight, then the building process will" +
|
||||
" give up further partitioning. In linear regression mode, this simply corresponds to minimum" +
|
||||
" number of instances needed to be in each node. The larger, the more conservative" +
|
||||
" the algorithm will be.", (value: Double) => value >= 0)
|
||||
|
||||
final def getMinChildWeight: Double = $(minChildWeight)
|
||||
|
||||
/**
|
||||
* Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it
|
||||
* means there is no constraint. If it is set to a positive value, it can help making the update
|
||||
@@ -77,113 +76,90 @@ private[spark] trait BoosterParams extends Params {
|
||||
* regression when class is extremely imbalanced. Set it to value of 1-10 might help control the
|
||||
* update. [default=0] range: [0, Double.MaxValue]
|
||||
*/
|
||||
final val maxDeltaStep = new DoubleParam(this, "maxDeltaStep", "Maximum delta step we allow " +
|
||||
"each tree's weight" +
|
||||
val maxDeltaStep = new DoubleParam(this, "max_delta_step", "Maximum delta step we allow each" +
|
||||
" tree's weight" +
|
||||
" estimation to be. If the value is set to 0, it means there is no constraint. If it is set" +
|
||||
" to a positive value, it can help making the update step more conservative. Usually this" +
|
||||
" parameter is not needed, but it might help in logistic regression when class is extremely" +
|
||||
" imbalanced. Set it to value of 1-10 might help control the update",
|
||||
(value: Double) => value >= 0)
|
||||
|
||||
final def getMaxDeltaStep: Double = $(maxDeltaStep)
|
||||
|
||||
/**
|
||||
* subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly
|
||||
* collected half of the data instances to grow trees and this will prevent overfitting.
|
||||
* [default=1] range:(0,1]
|
||||
*/
|
||||
final val subsample = new DoubleParam(this, "subsample", "subsample ratio of the training " +
|
||||
"instance. Setting it to 0.5 means that XGBoost randomly collected half of the data " +
|
||||
"instances to grow trees and this will prevent overfitting.",
|
||||
(value: Double) => value <= 1 && value > 0)
|
||||
|
||||
final def getSubsample: Double = $(subsample)
|
||||
val subSample = new DoubleParam(this, "subsample", "subsample ratio of the training instance." +
|
||||
" Setting it to 0.5 means that XGBoost randomly collected half of the data instances to" +
|
||||
" grow trees and this will prevent overfitting.", (value: Double) => value <= 1 && value > 0)
|
||||
|
||||
/**
|
||||
* subsample ratio of columns when constructing each tree. [default=1] range: (0,1]
|
||||
*/
|
||||
final val colsampleBytree = new DoubleParam(this, "colsampleBytree", "subsample ratio of " +
|
||||
"columns when constructing each tree.", (value: Double) => value <= 1 && value > 0)
|
||||
|
||||
final def getColsampleBytree: Double = $(colsampleBytree)
|
||||
val colSampleByTree = new DoubleParam(this, "colsample_bytree", "subsample ratio of columns" +
|
||||
" when constructing each tree.", (value: Double) => value <= 1 && value > 0)
|
||||
|
||||
/**
|
||||
* subsample ratio of columns for each split, in each level. [default=1] range: (0,1]
|
||||
*/
|
||||
final val colsampleBylevel = new DoubleParam(this, "colsampleBylevel", "subsample ratio of " +
|
||||
"columns for each split, in each level.", (value: Double) => value <= 1 && value > 0)
|
||||
|
||||
final def getColsampleBylevel: Double = $(colsampleBylevel)
|
||||
val colSampleByLevel = new DoubleParam(this, "colsample_bylevel", "subsample ratio of columns" +
|
||||
" for each split, in each level.", (value: Double) => value <= 1 && value > 0)
|
||||
|
||||
/**
|
||||
* L2 regularization term on weights, increase this value will make model more conservative.
|
||||
* [default=1]
|
||||
*/
|
||||
final val lambda = new DoubleParam(this, "lambda", "L2 regularization term on weights, " +
|
||||
"increase this value will make model more conservative.", (value: Double) => value >= 0)
|
||||
|
||||
final def getLambda: Double = $(lambda)
|
||||
val lambda = new DoubleParam(this, "lambda", "L2 regularization term on weights, increase this" +
|
||||
" value will make model more conservative.", (value: Double) => value >= 0)
|
||||
|
||||
/**
|
||||
* L1 regularization term on weights, increase this value will make model more conservative.
|
||||
* [default=0]
|
||||
*/
|
||||
final val alpha = new DoubleParam(this, "alpha", "L1 regularization term on weights, increase " +
|
||||
"this value will make model more conservative.", (value: Double) => value >= 0)
|
||||
|
||||
final def getAlpha: Double = $(alpha)
|
||||
val alpha = new DoubleParam(this, "alpha", "L1 regularization term on weights, increase this" +
|
||||
" value will make model more conservative.", (value: Double) => value >= 0)
|
||||
|
||||
/**
|
||||
* The tree construction algorithm used in XGBoost. options: {'auto', 'exact', 'approx'}
|
||||
* [default='auto']
|
||||
*/
|
||||
final val treeMethod = new Param[String](this, "treeMethod",
|
||||
val treeMethod = new Param[String](this, "tree_method",
|
||||
"The tree construction algorithm used in XGBoost, options: {'auto', 'exact', 'approx', 'hist'}",
|
||||
(value: String) => BoosterParams.supportedTreeMethods.contains(value))
|
||||
|
||||
final def getTreeMethod: String = $(treeMethod)
|
||||
|
||||
/**
|
||||
* growth policy for fast histogram algorithm
|
||||
*/
|
||||
final val growPolicy = new Param[String](this, "growPolicy",
|
||||
val growthPolicty = new Param[String](this, "grow_policy",
|
||||
"growth policy for fast histogram algorithm",
|
||||
(value: String) => BoosterParams.supportedGrowthPolicies.contains(value))
|
||||
|
||||
final def getGrowPolicy: String = $(growPolicy)
|
||||
|
||||
/**
|
||||
* maximum number of bins in histogram
|
||||
*/
|
||||
final val maxBins = new IntParam(this, "maxBin", "maximum number of bins in histogram",
|
||||
val maxBins = new IntParam(this, "max_bin", "maximum number of bins in histogram",
|
||||
(value: Int) => value > 0)
|
||||
|
||||
final def getMaxBins: Int = $(maxBins)
|
||||
|
||||
/**
|
||||
* This is only used for approximate greedy algorithm.
|
||||
* This roughly translated into O(1 / sketch_eps) number of bins. Compared to directly select
|
||||
* number of bins, this comes with theoretical guarantee with sketch accuracy.
|
||||
* [default=0.03] range: (0, 1)
|
||||
*/
|
||||
final val sketchEps = new DoubleParam(this, "sketchEps",
|
||||
val sketchEps = new DoubleParam(this, "sketch_eps",
|
||||
"This is only used for approximate greedy algorithm. This roughly translated into" +
|
||||
" O(1 / sketch_eps) number of bins. Compared to directly select number of bins, this comes" +
|
||||
" with theoretical guarantee with sketch accuracy.",
|
||||
(value: Double) => value < 1 && value > 0)
|
||||
|
||||
final def getSketchEps: Double = $(sketchEps)
|
||||
|
||||
/**
|
||||
* Control the balance of positive and negative weights, useful for unbalanced classes. A typical
|
||||
* value to consider: sum(negative cases) / sum(positive cases). [default=1]
|
||||
*/
|
||||
final val scalePosWeight = new DoubleParam(this, "scalePosWeight", "Control the balance of " +
|
||||
"positive and negative weights, useful for unbalanced classes. A typical value to consider:" +
|
||||
val scalePosWeight = new DoubleParam(this, "scale_pos_weight", "Control the balance of positive" +
|
||||
" and negative weights, useful for unbalanced classes. A typical value to consider:" +
|
||||
" sum(negative cases) / sum(positive cases)")
|
||||
|
||||
final def getScalePosWeight: Double = $(scalePosWeight)
|
||||
|
||||
// Dart boosters
|
||||
|
||||
/**
|
||||
@@ -191,64 +167,72 @@ private[spark] trait BoosterParams extends Params {
|
||||
* Type of sampling algorithm. "uniform": dropped trees are selected uniformly.
|
||||
* "weighted": dropped trees are selected in proportion to weight. [default="uniform"]
|
||||
*/
|
||||
final val sampleType = new Param[String](this, "sampleType", "type of sampling algorithm, " +
|
||||
"options: {'uniform', 'weighted'}",
|
||||
val sampleType = new Param[String](this, "sample_type", "type of sampling algorithm, options:" +
|
||||
" {'uniform', 'weighted'}",
|
||||
(value: String) => BoosterParams.supportedSampleType.contains(value))
|
||||
|
||||
final def getSampleType: String = $(sampleType)
|
||||
|
||||
/**
|
||||
* Parameter of Dart booster.
|
||||
* type of normalization algorithm, options: {'tree', 'forest'}. [default="tree"]
|
||||
*/
|
||||
final val normalizeType = new Param[String](this, "normalizeType", "type of normalization" +
|
||||
val normalizeType = new Param[String](this, "normalize_type", "type of normalization" +
|
||||
" algorithm, options: {'tree', 'forest'}",
|
||||
(value: String) => BoosterParams.supportedNormalizeType.contains(value))
|
||||
|
||||
final def getNormalizeType: String = $(normalizeType)
|
||||
|
||||
/**
|
||||
* Parameter of Dart booster.
|
||||
* dropout rate. [default=0.0] range: [0.0, 1.0]
|
||||
*/
|
||||
final val rateDrop = new DoubleParam(this, "rateDrop", "dropout rate", (value: Double) =>
|
||||
val rateDrop = new DoubleParam(this, "rate_drop", "dropout rate", (value: Double) =>
|
||||
value >= 0 && value <= 1)
|
||||
|
||||
final def getRateDrop: Double = $(rateDrop)
|
||||
|
||||
/**
|
||||
* Parameter of Dart booster.
|
||||
* probability of skip dropout. If a dropout is skipped, new trees are added in the same manner
|
||||
* as gbtree. [default=0.0] range: [0.0, 1.0]
|
||||
*/
|
||||
final val skipDrop = new DoubleParam(this, "skipDrop", "probability of skip dropout. If" +
|
||||
val skipDrop = new DoubleParam(this, "skip_drop", "probability of skip dropout. If" +
|
||||
" a dropout is skipped, new trees are added in the same manner as gbtree.",
|
||||
(value: Double) => value >= 0 && value <= 1)
|
||||
|
||||
final def getSkipDrop: Double = $(skipDrop)
|
||||
|
||||
// linear booster
|
||||
/**
|
||||
* Parameter of linear booster
|
||||
* L2 regularization term on bias, default 0(no L1 reg on bias because it is not important)
|
||||
*/
|
||||
final val lambdaBias = new DoubleParam(this, "lambdaBias", "L2 regularization term on bias, " +
|
||||
"default 0 (no L1 reg on bias because it is not important)", (value: Double) => value >= 0)
|
||||
val lambdaBias = new DoubleParam(this, "lambda_bias", "L2 regularization term on bias, default" +
|
||||
" 0 (no L1 reg on bias because it is not important)", (value: Double) => value >= 0)
|
||||
|
||||
final def getLambdaBias: Double = $(lambdaBias)
|
||||
|
||||
final val treeLimit = new IntParam(this, name = "treeLimit",
|
||||
doc = "number of trees used in the prediction; defaults to 0 (use all trees).")
|
||||
|
||||
final def getTreeLimit: Double = $(treeLimit)
|
||||
|
||||
setDefault(eta -> 0.3, gamma -> 0, maxDepth -> 6,
|
||||
setDefault(boosterType -> "gbtree", eta -> 0.3, gamma -> 0, maxDepth -> 6,
|
||||
minChildWeight -> 1, maxDeltaStep -> 0,
|
||||
growPolicy -> "depthwise", maxBins -> 16,
|
||||
subsample -> 1, colsampleBytree -> 1, colsampleBylevel -> 1,
|
||||
growthPolicty -> "depthwise", maxBins -> 16,
|
||||
subSample -> 1, colSampleByTree -> 1, colSampleByLevel -> 1,
|
||||
lambda -> 1, alpha -> 0, treeMethod -> "auto", sketchEps -> 0.03,
|
||||
scalePosWeight -> 1.0, sampleType -> "uniform", normalizeType -> "tree",
|
||||
rateDrop -> 0.0, skipDrop -> 0.0, lambdaBias -> 0, treeLimit -> 0)
|
||||
rateDrop -> 0.0, skipDrop -> 0.0, lambdaBias -> 0)
|
||||
|
||||
/**
|
||||
* Explains all params of this instance. See `explainParam()`.
|
||||
*/
|
||||
override def explainParams(): String = {
|
||||
// TODO: filter some parameters according to the booster type
|
||||
val boosterTypeStr = $(boosterType)
|
||||
val validParamList = {
|
||||
if (boosterTypeStr == "gblinear") {
|
||||
// gblinear
|
||||
params.filter(param => param.name == "lambda" ||
|
||||
param.name == "alpha" || param.name == "lambda_bias")
|
||||
} else if (boosterTypeStr != "dart") {
|
||||
// gbtree
|
||||
params.filter(param => param.name != "sample_type" &&
|
||||
param.name != "normalize_type" && param.name != "rate_drop" && param.name != "skip_drop")
|
||||
} else {
|
||||
// dart
|
||||
params.filter(_.name != "lambda_bias")
|
||||
}
|
||||
}
|
||||
explainParam(boosterType) + "\n" ++ validParamList.map(explainParam).mkString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
private[spark] object BoosterParams {
|
||||
|
||||
@@ -16,105 +16,84 @@
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark.params
|
||||
|
||||
import com.google.common.base.CaseFormat
|
||||
import ml.dmlc.xgboost4j.scala.spark.TrackerConf
|
||||
|
||||
import org.apache.spark.ml.param._
|
||||
import scala.collection.mutable
|
||||
|
||||
private[spark] trait GeneralParams extends Params {
|
||||
trait GeneralParams extends Params {
|
||||
|
||||
/**
|
||||
* The number of rounds for boosting
|
||||
*/
|
||||
final val numRound = new IntParam(this, "numRound", "The number of rounds for boosting",
|
||||
val round = new IntParam(this, "num_round", "The number of rounds for boosting",
|
||||
ParamValidators.gtEq(1))
|
||||
|
||||
final def getNumRound: Int = $(numRound)
|
||||
|
||||
/**
|
||||
* number of workers used to train xgboost model. default: 1
|
||||
*/
|
||||
final val numWorkers = new IntParam(this, "numWorkers", "number of workers used to run xgboost",
|
||||
val nWorkers = new IntParam(this, "nworkers", "number of workers used to run xgboost",
|
||||
ParamValidators.gtEq(1))
|
||||
|
||||
final def getNumWorkers: Int = $(numWorkers)
|
||||
|
||||
/**
|
||||
* number of threads used by per worker. default 1
|
||||
*/
|
||||
final val nthread = new IntParam(this, "nthread", "number of threads used by per worker",
|
||||
val numThreadPerTask = new IntParam(this, "nthread", "number of threads used by per worker",
|
||||
ParamValidators.gtEq(1))
|
||||
|
||||
final def getNthread: Int = $(nthread)
|
||||
|
||||
/**
|
||||
* whether to use external memory as cache. default: false
|
||||
*/
|
||||
final val useExternalMemory = new BooleanParam(this, "useExternalMemory",
|
||||
"whether to use external memory as cache")
|
||||
|
||||
final def getUseExternalMemory: Boolean = $(useExternalMemory)
|
||||
val useExternalMemory = new BooleanParam(this, "use_external_memory", "whether to use external" +
|
||||
"memory as cache")
|
||||
|
||||
/**
|
||||
* 0 means printing running messages, 1 means silent mode. default: 0
|
||||
*/
|
||||
final val silent = new IntParam(this, "silent",
|
||||
val silent = new IntParam(this, "silent",
|
||||
"0 means printing running messages, 1 means silent mode.",
|
||||
(value: Int) => value >= 0 && value <= 1)
|
||||
|
||||
final def getSilent: Int = $(silent)
|
||||
|
||||
/**
|
||||
* customized objective function provided by user. default: null
|
||||
*/
|
||||
final val customObj = new CustomObjParam(this, "customObj", "customized objective function " +
|
||||
val customObj = new CustomObjParam(this, "custom_obj", "customized objective function " +
|
||||
"provided by user")
|
||||
|
||||
/**
|
||||
* customized evaluation function provided by user. default: null
|
||||
*/
|
||||
final val customEval = new CustomEvalParam(this, "customEval",
|
||||
"customized evaluation function provided by user")
|
||||
val customEval = new CustomEvalParam(this, "custom_eval", "customized evaluation function " +
|
||||
"provided by user")
|
||||
|
||||
/**
|
||||
* the value treated as missing. default: Float.NaN
|
||||
*/
|
||||
final val missing = new FloatParam(this, "missing", "the value treated as missing")
|
||||
|
||||
final def getMissing: Float = $(missing)
|
||||
val missing = new FloatParam(this, "missing", "the value treated as missing")
|
||||
|
||||
/**
|
||||
* the maximum time to wait for the job requesting new workers. default: 30 minutes
|
||||
*/
|
||||
final val timeoutRequestWorkers = new LongParam(this, "timeoutRequestWorkers", "the maximum " +
|
||||
"time to request new Workers if numCores are insufficient. The timeout will be disabled " +
|
||||
"if this value is set smaller than or equal to 0.")
|
||||
|
||||
final def getTimeoutRequestWorkers: Long = $(timeoutRequestWorkers)
|
||||
val timeoutRequestWorkers = new LongParam(this, "timeout_request_workers", "the maximum time to" +
|
||||
" request new Workers if numCores are insufficient. The timeout will be disabled if this" +
|
||||
" value is set smaller than or equal to 0.")
|
||||
|
||||
/**
|
||||
* The hdfs folder to load and save checkpoint boosters. default: `empty_string`
|
||||
*/
|
||||
final val checkpointPath = new Param[String](this, "checkpointPath", "the hdfs folder to load " +
|
||||
"and save checkpoints. If there are existing checkpoints in checkpoint_path. The job will " +
|
||||
"load the checkpoint with highest version as the starting point for training. If " +
|
||||
val checkpointPath = new Param[String](this, "checkpoint_path", "the hdfs folder to load and " +
|
||||
"save checkpoints. If there are existing checkpoints in checkpoint_path. The job will load " +
|
||||
"the checkpoint with highest version as the starting point for training. If " +
|
||||
"checkpoint_interval is also set, the job will save a checkpoint every a few rounds.")
|
||||
|
||||
final def getCheckpointPath: String = $(checkpointPath)
|
||||
|
||||
/**
|
||||
* Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that
|
||||
* the trained model will get checkpointed every 10 iterations. Note: `checkpoint_path` must
|
||||
* also be set if the checkpoint interval is greater than 0.
|
||||
*/
|
||||
final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval",
|
||||
"set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the trained " +
|
||||
"model will get checkpointed every 10 iterations. Note: `checkpoint_path` must also be " +
|
||||
"set if the checkpoint interval is greater than 0.",
|
||||
(interval: Int) => interval == -1 || interval >= 1)
|
||||
|
||||
final def getCheckpointInterval: Int = $(checkpointInterval)
|
||||
val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "set checkpoint " +
|
||||
"interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the trained model will get " +
|
||||
"checkpointed every 10 iterations. Note: `checkpoint_path` must also be set if the checkpoint" +
|
||||
" interval is greater than 0.", (interval: Int) => interval == -1 || interval >= 1)
|
||||
|
||||
/**
|
||||
* Rabit tracker configurations. The parameter must be provided as an instance of the
|
||||
@@ -143,117 +122,15 @@ private[spark] trait GeneralParams extends Params {
|
||||
* Note that zero timeout value means to wait indefinitely (equivalent to Duration.Inf).
|
||||
* Ignored if the tracker implementation is "python".
|
||||
*/
|
||||
final val trackerConf = new TrackerConfParam(this, "trackerConf", "Rabit tracker configurations")
|
||||
val trackerConf = new TrackerConfParam(this, "tracker_conf", "Rabit tracker configurations")
|
||||
|
||||
/** Random seed for the C++ part of XGBoost and train/test splitting. */
|
||||
final val seed = new LongParam(this, "seed", "random seed")
|
||||
val seed = new LongParam(this, "seed", "random seed")
|
||||
|
||||
final def getSeed: Long = $(seed)
|
||||
|
||||
setDefault(numRound -> 1, numWorkers -> 1, nthread -> 1,
|
||||
setDefault(round -> 1, nWorkers -> 1, numThreadPerTask -> 1,
|
||||
useExternalMemory -> false, silent -> 0,
|
||||
customObj -> null, customEval -> null, missing -> Float.NaN,
|
||||
trackerConf -> TrackerConf(), seed -> 0, timeoutRequestWorkers -> 30 * 60 * 1000L,
|
||||
checkpointPath -> "", checkpointInterval -> -1
|
||||
)
|
||||
}
|
||||
|
||||
trait HasLeafPredictionCol extends Params {
|
||||
/**
|
||||
* Param for leaf prediction column name.
|
||||
* @group param
|
||||
*/
|
||||
final val leafPredictionCol: Param[String] = new Param[String](this, "leafPredictionCol",
|
||||
"name of the predictLeaf results")
|
||||
|
||||
/** @group getParam */
|
||||
final def getLeafPredictionCol: String = $(leafPredictionCol)
|
||||
}
|
||||
|
||||
trait HasContribPredictionCol extends Params {
|
||||
/**
|
||||
* Param for contribution prediction column name.
|
||||
* @group param
|
||||
*/
|
||||
final val contribPredictionCol: Param[String] = new Param[String](this, "contribPredictionCol",
|
||||
"name of the predictContrib results")
|
||||
|
||||
/** @group getParam */
|
||||
final def getContribPredictionCol: String = $(contribPredictionCol)
|
||||
}
|
||||
|
||||
trait HasBaseMarginCol extends Params {
|
||||
|
||||
/**
|
||||
* Param for initial prediction (aka base margin) column name.
|
||||
* @group param
|
||||
*/
|
||||
final val baseMarginCol: Param[String] = new Param[String](this, "baseMarginCol",
|
||||
"Initial prediction (aka base margin) column name.")
|
||||
|
||||
/** @group getParam */
|
||||
final def getBaseMarginCol: String = $(baseMarginCol)
|
||||
}
|
||||
|
||||
trait HasGroupCol extends Params {
|
||||
|
||||
/**
|
||||
* Param for group column name.
|
||||
* @group param
|
||||
*/
|
||||
final val groupCol: Param[String] = new Param[String](this, "groupCol", "group column name.")
|
||||
|
||||
/** @group getParam */
|
||||
final def getGroupCol: String = $(groupCol)
|
||||
|
||||
}
|
||||
|
||||
trait HasNumClass extends Params {
|
||||
|
||||
/**
|
||||
* number of classes
|
||||
*/
|
||||
final val numClass = new IntParam(this, "numClass", "number of classes")
|
||||
|
||||
/** @group getParam */
|
||||
final def getNumClass: Int = $(numClass)
|
||||
}
|
||||
|
||||
private[spark] trait ParamMapFuncs extends Params {
|
||||
|
||||
def XGBoostToMLlibParams(xgboostParams: Map[String, Any]): Unit = {
|
||||
for ((paramName, paramValue) <- xgboostParams) {
|
||||
if ((paramName == "booster" && paramValue != "gbtree") ||
|
||||
(paramName == "updater" && paramValue != "grow_colmaker,prune")) {
|
||||
throw new IllegalArgumentException(s"you specified $paramName as $paramValue," +
|
||||
s" XGBoost-Spark only supports gbtree as booster type" +
|
||||
" and grow_colmaker,prune as the updater type")
|
||||
}
|
||||
val name = CaseFormat.LOWER_UNDERSCORE.to(CaseFormat.LOWER_CAMEL, paramName)
|
||||
params.find(_.name == name) match {
|
||||
case None =>
|
||||
case Some(_: DoubleParam) =>
|
||||
set(name, paramValue.toString.toDouble)
|
||||
case Some(_: BooleanParam) =>
|
||||
set(name, paramValue.toString.toBoolean)
|
||||
case Some(_: IntParam) =>
|
||||
set(name, paramValue.toString.toInt)
|
||||
case Some(_: FloatParam) =>
|
||||
set(name, paramValue.toString.toFloat)
|
||||
case Some(_: Param[_]) =>
|
||||
set(name, paramValue)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def MLlib2XGBoostParams: Map[String, Any] = {
|
||||
val xgboostParams = new mutable.HashMap[String, Any]()
|
||||
for (param <- params) {
|
||||
if (isDefined(param)) {
|
||||
val name = CaseFormat.LOWER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, param.name)
|
||||
xgboostParams += name -> $(param)
|
||||
}
|
||||
}
|
||||
xgboostParams.toMap
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,69 +20,82 @@ import scala.collection.immutable.HashSet
|
||||
|
||||
import org.apache.spark.ml.param._
|
||||
|
||||
private[spark] trait LearningTaskParams extends Params {
|
||||
trait LearningTaskParams extends Params {
|
||||
|
||||
/**
|
||||
* number of tasks to learn
|
||||
*/
|
||||
val numClasses = new IntParam(this, "num_class", "number of classes")
|
||||
|
||||
/**
|
||||
* Specify the learning task and the corresponding learning objective.
|
||||
* options: reg:linear, reg:logistic, binary:logistic, binary:logitraw, count:poisson,
|
||||
* multi:softmax, multi:softprob, rank:pairwise, reg:gamma. default: reg:linear
|
||||
*/
|
||||
final val objective = new Param[String](this, "objective", "objective function used for " +
|
||||
s"training, options: {${LearningTaskParams.supportedObjective.mkString(",")}",
|
||||
val objective = new Param[String](this, "objective", "objective function used for training," +
|
||||
s" options: {${LearningTaskParams.supportedObjective.mkString(",")}",
|
||||
(value: String) => LearningTaskParams.supportedObjective.contains(value))
|
||||
|
||||
final def getObjective: String = $(objective)
|
||||
|
||||
/**
|
||||
* the initial prediction score of all instances, global bias. default=0.5
|
||||
*/
|
||||
final val baseScore = new DoubleParam(this, "baseScore", "the initial prediction score of all" +
|
||||
val baseScore = new DoubleParam(this, "base_score", "the initial prediction score of all" +
|
||||
" instances, global bias")
|
||||
|
||||
final def getBaseScore: Double = $(baseScore)
|
||||
|
||||
/**
|
||||
* evaluation metrics for validation data, a default metric will be assigned according to
|
||||
* objective(rmse for regression, and error for classification, mean average precision for
|
||||
* ranking). options: rmse, mae, logloss, error, merror, mlogloss, auc, aucpr, ndcg, map,
|
||||
* gamma-deviance
|
||||
*/
|
||||
final val evalMetric = new Param[String](this, "evalMetric", "evaluation metrics for " +
|
||||
"validation data, a default metric will be assigned according to objective " +
|
||||
"(rmse for regression, and error for classification, mean average precision for ranking), " +
|
||||
s"options: {${LearningTaskParams.supportedEvalMetrics.mkString(",")}}",
|
||||
val evalMetric = new Param[String](this, "eval_metric", "evaluation metrics for validation" +
|
||||
" data, a default metric will be assigned according to objective (rmse for regression, and" +
|
||||
" error for classification, mean average precision for ranking), options: " +
|
||||
s" {${LearningTaskParams.supportedEvalMetrics.mkString(",")}}",
|
||||
(value: String) => LearningTaskParams.supportedEvalMetrics.contains(value))
|
||||
|
||||
final def getEvalMetric: String = $(evalMetric)
|
||||
/**
|
||||
* group data specify each group sizes for ranking task. To correspond to partition of
|
||||
* training data, it is nested.
|
||||
*/
|
||||
val groupData = new GroupDataParam(this, "groupData", "group data specify each group size" +
|
||||
" for ranking task. To correspond to partition of training data, it is nested.")
|
||||
|
||||
/**
|
||||
* Initial prediction (aka base margin) column name.
|
||||
*/
|
||||
val baseMarginCol = new Param[String](this, "baseMarginCol", "base margin column name")
|
||||
|
||||
/**
|
||||
* Instance weights column name.
|
||||
*/
|
||||
val weightCol = new Param[String](this, "weightCol", "weight column name")
|
||||
|
||||
/**
|
||||
* Fraction of training points to use for testing.
|
||||
*/
|
||||
final val trainTestRatio = new DoubleParam(this, "trainTestRatio",
|
||||
val trainTestRatio = new DoubleParam(this, "trainTestRatio",
|
||||
"fraction of training points to use for testing",
|
||||
ParamValidators.inRange(0, 1))
|
||||
|
||||
final def getTrainTestRatio: Double = $(trainTestRatio)
|
||||
|
||||
/**
|
||||
* If non-zero, the training will be stopped after a specified number
|
||||
* of consecutive increases in any evaluation metric.
|
||||
*/
|
||||
final val numEarlyStoppingRounds = new IntParam(this, "numEarlyStoppingRounds",
|
||||
val numEarlyStoppingRounds = new IntParam(this, "numEarlyStoppingRounds",
|
||||
"number of rounds of decreasing eval metric to tolerate before " +
|
||||
"stopping the training",
|
||||
(value: Int) => value == 0 || value > 1)
|
||||
|
||||
final def getNumEarlyStoppingRounds: Int = $(numEarlyStoppingRounds)
|
||||
|
||||
setDefault(objective -> "reg:linear", baseScore -> 0.5,
|
||||
trainTestRatio -> 1.0, numEarlyStoppingRounds -> 0)
|
||||
setDefault(objective -> "reg:linear", baseScore -> 0.5, numClasses -> 2, groupData -> null,
|
||||
baseMarginCol -> "baseMargin", weightCol -> "weight", trainTestRatio -> 1.0,
|
||||
numEarlyStoppingRounds -> 0)
|
||||
}
|
||||
|
||||
private[spark] object LearningTaskParams {
|
||||
val supportedObjective = HashSet("reg:linear", "reg:logistic", "binary:logistic",
|
||||
"binary:logitraw", "count:poisson", "multi:softmax", "multi:softprob", "rank:pairwise",
|
||||
"reg:gamma", "reg:tweedie")
|
||||
"reg:gamma")
|
||||
|
||||
val supportedEvalMetrics = HashSet("rmse", "mae", "logloss", "error", "merror", "mlogloss",
|
||||
"auc", "aucpr", "ndcg", "map", "gamma-deviance")
|
||||
|
||||
@@ -33,14 +33,13 @@ import scala.concurrent.{Await, Future, TimeoutException}
|
||||
*
|
||||
* @param sc The SparkContext object
|
||||
* @param timeout The maximum time to wait for enough number of workers.
|
||||
* @param numWorkers nWorkers used in an XGBoost Job
|
||||
* @param nWorkers nWorkers used in an XGBoost Job
|
||||
*/
|
||||
class SparkParallelismTracker(
|
||||
val sc: SparkContext,
|
||||
timeout: Long,
|
||||
numWorkers: Int) {
|
||||
nWorkers: Int) {
|
||||
|
||||
private[this] val requestedCores = numWorkers * sc.conf.getInt("spark.task.cpus", 1)
|
||||
private[this] val mapper = new ObjectMapper()
|
||||
private[this] val logger = LogFactory.getLog("XGBoostSpark")
|
||||
private[this] val url = sc.uiWebUrl match {
|
||||
@@ -77,12 +76,12 @@ class SparkParallelismTracker(
|
||||
}
|
||||
|
||||
private[this] def safeExecute[T](body: => T): T = {
|
||||
val listener = new TaskFailedListener
|
||||
val listener = new TaskFailedListener;
|
||||
sc.addSparkListener(listener)
|
||||
try {
|
||||
body
|
||||
} finally {
|
||||
sc.removeSparkListener(listener)
|
||||
sc.listenerBus.removeListener(listener)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -100,11 +99,10 @@ class SparkParallelismTracker(
|
||||
body
|
||||
} else {
|
||||
try {
|
||||
waitForCondition(numAliveCores >= requestedCores, timeout)
|
||||
waitForCondition(numAliveCores >= nWorkers, timeout)
|
||||
} catch {
|
||||
case _: TimeoutException =>
|
||||
throw new IllegalStateException(s"Unable to get $requestedCores workers for" +
|
||||
s" XGBoost training")
|
||||
throw new IllegalStateException(s"Unable to get $nWorkers workers for XGBoost training")
|
||||
}
|
||||
safeExecute(body)
|
||||
}
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
0 1:985.574005058 2:320.223538037 3:0.621236086198
|
||||
0 1:1010.52917943 2:635.535543082 3:2.14984030531
|
||||
0 1:1012.91900422 2:132.387300057 3:0.488761066665
|
||||
0 1:990.829194034 2:135.102081162 3:0.747701610673
|
||||
0 1:1007.05103629 2:154.289183562 3:0.464118249201
|
||||
0 1:994.9573036 2:317.483732878 3:0.0313685555674
|
||||
0 1:987.8071541 2:731.349178363 3:0.244616944245
|
||||
1 1:10.0349544469 2:2.29750906143 3:36.4949974282
|
||||
0 1:9.92953881383 2:5.39134047297 3:120.041297548
|
||||
0 1:10.0909866713 2:9.06191026312 3:138.807825798
|
||||
1 1:10.2090970614 2:0.0784495944448 3:58.207703565
|
||||
0 1:9.85695905893 2:9.99500727713 3:56.8610243778
|
||||
1 1:10.0805758547 2:0.0410805760559 3:222.102302076
|
||||
0 1:10.1209914486 2:9.9729127088 3:171.888238763
|
||||
0 1:10.0331939798 2:0.853339303793 3:311.181328375
|
||||
0 1:9.93901762951 2:2.72757449146 3:78.4859514413
|
||||
0 1:10.0752365346 2:9.18695328235 3:49.8520256553
|
||||
1 1:10.0456548902 2:0.270936043122 3:123.462958597
|
||||
0 1:10.0568923673 2:0.82997113263 3:44.9391426001
|
||||
0 1:9.8214143472 2:0.277538931578 3:15.4217659578
|
||||
0 1:9.95258604431 2:8.69564346094 3:255.513470671
|
||||
0 1:9.91934976357 2:7.72809741413 3:82.171591817
|
||||
0 1:10.043239582 2:8.64168255553 3:38.9657919329
|
||||
1 1:10.0236147929 2:0.0496662263659 3:4.40889812286
|
||||
1 1:1001.85585324 2:3.75646886071 3:0.0179224994842
|
||||
0 1:1014.25578571 2:0.285765311201 3:0.510329864983
|
||||
1 1:1002.81422786 2:9.77676280375 3:0.433705951912
|
||||
1 1:998.072711553 2:2.82100686538 3:0.889829076909
|
||||
0 1:1003.77395036 2:2.55916592114 3:0.0359402151496
|
||||
1 1:10.0807877782 2:4.98513959013 3:47.5266363559
|
||||
0 1:10.0015013081 2:9.94302478763 3:78.3697486277
|
||||
1 1:10.0441936789 2:0.305091816635 3:56.8213984987
|
||||
0 1:9.94257106618 2:7.23909568913 3:442.463339039
|
||||
1 1:9.86479307916 2:6.41701315844 3:55.1365304834
|
||||
0 1:10.0428628516 2:9.98466447697 3:0.391632812588
|
||||
0 1:9.94445884566 2:9.99970945878 3:260.438436534
|
||||
1 1:9.84641392823 2:225.78051312 3:1.00525978847
|
||||
1 1:9.86907690608 2:26.8971083147 3:0.577959255991
|
||||
0 1:10.0177314626 2:0.110585342313 3:2.30545043031
|
||||
0 1:10.0688190907 2:412.023866234 3:1.22421542264
|
||||
0 1:10.1251769646 2:13.8212202925 3:0.129171734504
|
||||
0 1:10.0840758802 2:407.359097187 3:0.477000870705
|
||||
0 1:10.1007458705 2:987.183625145 3:0.149385677415
|
||||
0 1:9.86472656059 2:169.559640615 3:0.147221652519
|
||||
0 1:9.94207419238 2:507.290053755 3:0.41996207214
|
||||
0 1:9.9671005502 2:1.62610457716 3:0.408173666788
|
||||
0 1:1010.57126596 2:9.06673707562 3:0.672092284372
|
||||
0 1:1001.6718262 2:9.53203990055 3:4.7364050044
|
||||
0 1:995.777341384 2:4.43847316256 3:2.07229073634
|
||||
0 1:1002.95701386 2:5.51711016665 3:1.24294450546
|
||||
0 1:1016.0988238 2:0.626468941906 3:0.105627919134
|
||||
0 1:1013.67571419 2:0.042315529666 3:0.717619310322
|
||||
1 1:994.747747892 2:6.01989364024 3:0.772910130015
|
||||
1 1:991.654593872 2:7.35575736952 3:1.19822091548
|
||||
0 1:1008.47101732 2:8.28240754909 3:0.229582481359
|
||||
0 1:1000.81975227 2:1.52448354056 3:0.096441660362
|
||||
0 1:10.0900922344 2:322.656649307 3:57.8149073088
|
||||
1 1:10.0868337371 2:2.88652339174 3:54.8865514572
|
||||
0 1:10.0988984137 2:979.483832657 3:52.6809830901
|
||||
0 1:9.97678959238 2:665.770979738 3:481.069628909
|
||||
0 1:9.78554312773 2:257.309358658 3:47.7324475232
|
||||
0 1:10.0985967566 2:935.896512941 3:138.937052808
|
||||
0 1:10.0522252319 2:876.376299607 3:6.00373510669
|
||||
1 1:9.88065229501 2:9.99979825653 3:0.0674603696149
|
||||
0 1:10.0483244098 2:0.0653852316381 3:0.130679349938
|
||||
1 1:9.99685215607 2:1.76602542774 3:0.2551321159
|
||||
0 1:9.99750159428 2:1.01591534436 3:0.145445506504
|
||||
1 1:9.97380908941 2:0.940048645571 3:0.411805696316
|
||||
0 1:9.99977678382 2:6.91329929641 3:5.57858201258
|
||||
0 1:978.876096381 2:933.775364741 3:0.579170824236
|
||||
0 1:998.381016406 2:220.940470582 3:2.01491778565
|
||||
0 1:987.917644594 2:8.74667873567 3:0.364006099758
|
||||
0 1:1000.20994892 2:25.2945450565 3:3.5684398964
|
||||
0 1:1014.57141264 2:675.593540733 3:0.164174055535
|
||||
0 1:998.867283535 2:765.452750642 3:0.818425293238
|
||||
@@ -0,0 +1,10 @@
|
||||
7
|
||||
7
|
||||
10
|
||||
5
|
||||
7
|
||||
10
|
||||
10
|
||||
7
|
||||
6
|
||||
6
|
||||
@@ -0,0 +1,74 @@
|
||||
0 1:10.2143092481 2:273.576539531 3:137.111774354
|
||||
0 1:10.0366658918 2:842.469052609 3:2.32134375927
|
||||
0 1:10.1281202091 2:395.654057342 3:35.4184893063
|
||||
0 1:10.1443721289 2:960.058461049 3:272.887070637
|
||||
0 1:10.1353234784 2:535.51304462 3:2.15393842032
|
||||
1 1:10.0451640374 2:216.733858424 3:55.6533298016
|
||||
1 1:9.94254592171 2:44.5985537358 3:304.614176871
|
||||
0 1:10.1319257181 2:613.545504487 3:5.42391587912
|
||||
0 1:1020.63622468 2:997.476744201 3:0.509425590461
|
||||
0 1:986.304585519 2:822.669937965 3:0.605133561808
|
||||
1 1:1012.66863221 2:26.7185759069 3:0.0875458784828
|
||||
0 1:995.387656321 2:81.8540176995 3:0.691999430068
|
||||
0 1:1020.6587198 2:848.826964547 3:0.540159430526
|
||||
1 1:1003.81573853 2:379.84350931 3:0.0083682925194
|
||||
0 1:1021.60921516 2:641.376951467 3:1.12339054807
|
||||
0 1:1000.17585041 2:122.107138713 3:1.09906375372
|
||||
1 1:987.64802348 2:5.98448541152 3:0.124241987204
|
||||
1 1:9.94610136583 2:346.114985897 3:0.387708236565
|
||||
0 1:9.96812192337 2:313.278109696 3:0.00863026595671
|
||||
0 1:10.0181739194 2:36.7378924562 3:2.92179879835
|
||||
0 1:9.89000102695 2:164.273723971 3:0.685222591968
|
||||
0 1:10.1555212436 2:320.451459462 3:2.01341536261
|
||||
0 1:10.0085727613 2:999.767117646 3:0.462294934168
|
||||
1 1:9.93099658724 2:5.17478203909 3:0.213855205032
|
||||
0 1:10.0629454957 2:663.088181857 3:0.049022351462
|
||||
0 1:10.1109732417 2:734.904569784 3:1.6998450094
|
||||
0 1:1006.6015266 2:505.023453703 3:1.90870566777
|
||||
0 1:991.865769489 2:245.437343115 3:0.475109744256
|
||||
0 1:998.682734072 2:950.041057232 3:1.9256314201
|
||||
0 1:1005.02207209 2:2.9619314197 3:0.0517146822357
|
||||
0 1:1002.54526214 2:860.562681899 3:0.915687092848
|
||||
0 1:1000.38847359 2:808.416525088 3:0.209690673808
|
||||
1 1:992.557818382 2:373.889409453 3:0.107571728577
|
||||
0 1:1002.07722137 2:997.329626371 3:1.06504260496
|
||||
0 1:1000.40504333 2:949.832139189 3:0.539159980327
|
||||
0 1:10.1460179902 2:8.86082969819 3:135.953842715
|
||||
1 1:9.98529296553 2:2.87366448495 3:1.74249892194
|
||||
0 1:9.88942676744 2:9.4031821056 3:149.473066381
|
||||
1 1:10.0192953341 2:1.99685737576 3:1.79502473397
|
||||
0 1:10.0110654379 2:8.13112593726 3:87.7765628103
|
||||
0 1:997.148677047 2:733.936190093 3:1.49298494242
|
||||
0 1:1008.70465919 2:957.121652078 3:0.217414013634
|
||||
1 1:997.356154278 2:541.599587807 3:0.100855972216
|
||||
0 1:999.615897283 2:943.700501824 3:0.862874175879
|
||||
1 1:997.36859077 2:0.200859940848 3:0.13601892182
|
||||
0 1:10.0423255624 2:1.73855202168 3:0.956695338485
|
||||
1 1:9.88440755486 2:9.9994600678 3:0.305080529665
|
||||
0 1:10.0891026412 2:3.28031719474 3:0.364450973697
|
||||
0 1:9.90078644258 2:8.77839663617 3:0.456660574479
|
||||
1 1:9.79380029711 2:8.77220326156 3:0.527292005175
|
||||
0 1:9.93613887011 2:9.76270841268 3:1.40865693823
|
||||
0 1:10.0009239007 2:7.29056178263 3:0.498015866607
|
||||
0 1:9.96603319905 2:5.12498000925 3:0.517492532783
|
||||
0 1:10.0923827222 2:2.76652583955 3:1.56571226159
|
||||
1 1:10.0983782035 2:587.788120694 3:0.031756483687
|
||||
1 1:9.91397225464 2:994.527496819 3:3.72092164978
|
||||
0 1:10.1057472738 2:2.92894440088 3:0.683506438532
|
||||
0 1:10.1014053354 2:959.082038017 3:1.07039624129
|
||||
0 1:10.1433253044 2:322.515119317 3:0.51408278993
|
||||
1 1:9.82832510699 2:637.104433908 3:0.250272776427
|
||||
0 1:1000.49729075 2:2.75336888111 3:0.576634423274
|
||||
1 1:984.90338088 2:0.0295435794035 3:1.26273339929
|
||||
0 1:1001.53811442 2:4.64164410861 3:0.0293389959504
|
||||
1 1:995.875898395 2:5.08223403205 3:0.382330566779
|
||||
0 1:996.405937252 2:6.26395190757 3:0.453645816611
|
||||
0 1:10.0165140779 2:340.126072514 3:0.220794603312
|
||||
0 1:9.93482824816 2:951.672000448 3:0.124406293612
|
||||
0 1:10.1700278554 2:0.0140985961008 3:0.252452256311
|
||||
0 1:9.99825079542 2:950.382643896 3:0.875382402062
|
||||
0 1:9.87316410028 2:686.788257829 3:0.215886999825
|
||||
0 1:10.2893240654 2:89.3947931451 3:0.569578232133
|
||||
0 1:9.98689192703 2:0.430107535413 3:2.99869831728
|
||||
0 1:10.1365175107 2:972.279245093 3:0.0865099386744
|
||||
0 1:9.90744703306 2:50.810461183 3:3.00863325197
|
||||
@@ -0,0 +1,10 @@
|
||||
8
|
||||
9
|
||||
9
|
||||
9
|
||||
5
|
||||
5
|
||||
9
|
||||
6
|
||||
5
|
||||
9
|
||||
@@ -0,0 +1,10 @@
|
||||
7
|
||||
5
|
||||
9
|
||||
6
|
||||
6
|
||||
8
|
||||
7
|
||||
6
|
||||
5
|
||||
7
|
||||
@@ -1,66 +0,0 @@
|
||||
0,10.0229017899,7.30178495562,0.118115020017,1
|
||||
0,9.93639621859,9.93102159291,0.0435030004396,1
|
||||
0,10.1301737265,0.00411765220572,2.4165878053,1
|
||||
1,9.87828587087,0.608588414992,0.111262590883,1
|
||||
0,10.1373430048,0.47764012225,0.991553052194,1
|
||||
0,10.0523814718,4.72152505167,0.672978832666,1
|
||||
0,10.0449715742,8.40373928536,0.384457573667,1
|
||||
1,996.398498791,941.976309154,0.230269231292,2
|
||||
0,1005.11269468,900.093680877,0.265031528873,2
|
||||
0,997.160349441,891.331101688,2.19362017313,2
|
||||
0,993.754139031,44.8000165317,1.03868009875,2
|
||||
1,994.831299184,241.959208453,0.667631827024,2
|
||||
0,995.948333283,7.94326917112,0.750490877118,3
|
||||
0,989.733981273,7.52077625436,0.0126335967282,3
|
||||
0,1003.54086516,6.48177510564,1.19441696788,3
|
||||
0,996.56177804,9.71959812613,1.33082465111,3
|
||||
0,1005.61382467,0.234339369309,1.17987797356,3
|
||||
1,980.215758708,6.85554542926,2.63965085259,3
|
||||
1,987.776408872,2.23354609991,0.841885278028,3
|
||||
0,1006.54260396,8.12142049834,2.26639471174,3
|
||||
0,1009.87927639,6.40028519044,0.775155669615,3
|
||||
0,9.95006244393,928.76896718,234.948458244,4
|
||||
1,10.0749152258,255.294574476,62.9728604166,4
|
||||
1,10.1916541988,312.682867085,92.299413677,4
|
||||
0,9.95646724484,742.263188416,53.3310473654,4
|
||||
0,9.86211293222,996.237023866,2.00760301168,4
|
||||
1,9.91801019468,303.971783709,50.3147230679,4
|
||||
0,996.983996934,9.52188222766,1.33588120981,5
|
||||
0,995.704388126,9.49260524915,0.908498516541,5
|
||||
0,987.86480767,0.0870786716821,0.108859297837,5
|
||||
0,1000.99561307,2.85272694575,0.171134518956,5
|
||||
0,1011.05508066,7.55336771768,1.04950084825,5
|
||||
1,985.52199365,0.763305780608,1.7402424375,5
|
||||
0,10.0430321467,813.185427181,4.97728254185,6
|
||||
0,10.0812334228,258.297288417,0.127477670549,6
|
||||
0,9.84210504292,887.205815261,0.991689193955,6
|
||||
1,9.94625332613,0.298622762132,0.147881353231,6
|
||||
0,9.97800659954,727.619819757,0.0718361141866,6
|
||||
1,9.8037938472,957.385549617,0.0618862028941,6
|
||||
0,10.0880634741,185.024638577,1.7028095095,6
|
||||
0,9.98630799154,109.10631473,0.681117359751,6
|
||||
0,9.91671416638,166.248076588,122.538291094,7
|
||||
0,10.1206910464,88.1539468531,141.189859069,7
|
||||
1,10.1767160518,1.02960996847,172.02256237,7
|
||||
0,9.93025147233,391.196641942,58.040338247,7
|
||||
0,9.84850936037,474.63346537,17.5627875397,7
|
||||
1,9.8162731343,61.9199554213,30.6740972851,7
|
||||
0,10.0403482984,987.50416929,73.0472906209,7
|
||||
1,997.019228359,133.294717663,0.0572254083186,8
|
||||
0,973.303999107,1.79080888849,0.100478717048,8
|
||||
0,1008.28808825,342.282350685,0.409806485495,8
|
||||
0,1014.55621524,0.680510407082,0.929530602495,8
|
||||
1,1012.74370325,823.105266455,0.0894693730585,8
|
||||
0,1003.63554038,727.334432075,0.58206275756,8
|
||||
0,10.1560432436,740.35938307,11.6823378533,9
|
||||
0,9.83949099701,512.828227154,138.206666681,9
|
||||
1,10.1837395682,179.287126088,185.479062365,9
|
||||
1,9.9761881495,12.1093388336,9.1264604171,9
|
||||
1,9.77402180766,318.561317743,80.6005221355,9
|
||||
0,1011.15705381,0.215825852155,1.34429667906,10
|
||||
0,1005.60353229,727.202346126,1.47146041005,10
|
||||
1,1013.93702961,58.7312725205,0.421041560754,10
|
||||
0,1004.86813074,757.693204258,0.566055205344,10
|
||||
0,999.996324692,813.12386828,0.864428279513,10
|
||||
0,996.55255931,918.760056995,0.43365051974,10
|
||||
1,1004.1394132,464.371823646,0.312492288321,10
|
||||
|
@@ -1,149 +0,0 @@
|
||||
0,985.574005058,320.223538037,0.621236086198,1
|
||||
0,1010.52917943,635.535543082,2.14984030531,1
|
||||
0,1012.91900422,132.387300057,0.488761066665,1
|
||||
0,990.829194034,135.102081162,0.747701610673,1
|
||||
0,1007.05103629,154.289183562,0.464118249201,1
|
||||
0,994.9573036,317.483732878,0.0313685555674,1
|
||||
0,987.8071541,731.349178363,0.244616944245,1
|
||||
1,10.0349544469,2.29750906143,36.4949974282,2
|
||||
0,9.92953881383,5.39134047297,120.041297548,2
|
||||
0,10.0909866713,9.06191026312,138.807825798,2
|
||||
1,10.2090970614,0.0784495944448,58.207703565,2
|
||||
0,9.85695905893,9.99500727713,56.8610243778,2
|
||||
1,10.0805758547,0.0410805760559,222.102302076,2
|
||||
0,10.1209914486,9.9729127088,171.888238763,2
|
||||
0,10.0331939798,0.853339303793,311.181328375,3
|
||||
0,9.93901762951,2.72757449146,78.4859514413,3
|
||||
0,10.0752365346,9.18695328235,49.8520256553,3
|
||||
1,10.0456548902,0.270936043122,123.462958597,3
|
||||
0,10.0568923673,0.82997113263,44.9391426001,3
|
||||
0,9.8214143472,0.277538931578,15.4217659578,3
|
||||
0,9.95258604431,8.69564346094,255.513470671,3
|
||||
0,9.91934976357,7.72809741413,82.171591817,3
|
||||
0,10.043239582,8.64168255553,38.9657919329,3
|
||||
1,10.0236147929,0.0496662263659,4.40889812286,3
|
||||
1,1001.85585324,3.75646886071,0.0179224994842,4
|
||||
0,1014.25578571,0.285765311201,0.510329864983,4
|
||||
1,1002.81422786,9.77676280375,0.433705951912,4
|
||||
1,998.072711553,2.82100686538,0.889829076909,4
|
||||
0,1003.77395036,2.55916592114,0.0359402151496,4
|
||||
1,10.0807877782,4.98513959013,47.5266363559,5
|
||||
0,10.0015013081,9.94302478763,78.3697486277,5
|
||||
1,10.0441936789,0.305091816635,56.8213984987,5
|
||||
0,9.94257106618,7.23909568913,442.463339039,5
|
||||
1,9.86479307916,6.41701315844,55.1365304834,5
|
||||
0,10.0428628516,9.98466447697,0.391632812588,5
|
||||
0,9.94445884566,9.99970945878,260.438436534,5
|
||||
1,9.84641392823,225.78051312,1.00525978847,6
|
||||
1,9.86907690608,26.8971083147,0.577959255991,6
|
||||
0,10.0177314626,0.110585342313,2.30545043031,6
|
||||
0,10.0688190907,412.023866234,1.22421542264,6
|
||||
0,10.1251769646,13.8212202925,0.129171734504,6
|
||||
0,10.0840758802,407.359097187,0.477000870705,6
|
||||
0,10.1007458705,987.183625145,0.149385677415,6
|
||||
0,9.86472656059,169.559640615,0.147221652519,6
|
||||
0,9.94207419238,507.290053755,0.41996207214,6
|
||||
0,9.9671005502,1.62610457716,0.408173666788,6
|
||||
0,1010.57126596,9.06673707562,0.672092284372,7
|
||||
0,1001.6718262,9.53203990055,4.7364050044,7
|
||||
0,995.777341384,4.43847316256,2.07229073634,7
|
||||
0,1002.95701386,5.51711016665,1.24294450546,7
|
||||
0,1016.0988238,0.626468941906,0.105627919134,7
|
||||
0,1013.67571419,0.042315529666,0.717619310322,7
|
||||
1,994.747747892,6.01989364024,0.772910130015,7
|
||||
1,991.654593872,7.35575736952,1.19822091548,7
|
||||
0,1008.47101732,8.28240754909,0.229582481359,7
|
||||
0,1000.81975227,1.52448354056,0.096441660362,7
|
||||
0,10.0900922344,322.656649307,57.8149073088,8
|
||||
1,10.0868337371,2.88652339174,54.8865514572,8
|
||||
0,10.0988984137,979.483832657,52.6809830901,8
|
||||
0,9.97678959238,665.770979738,481.069628909,8
|
||||
0,9.78554312773,257.309358658,47.7324475232,8
|
||||
0,10.0985967566,935.896512941,138.937052808,8
|
||||
0,10.0522252319,876.376299607,6.00373510669,8
|
||||
1,9.88065229501,9.99979825653,0.0674603696149,9
|
||||
0,10.0483244098,0.0653852316381,0.130679349938,9
|
||||
1,9.99685215607,1.76602542774,0.2551321159,9
|
||||
0,9.99750159428,1.01591534436,0.145445506504,9
|
||||
1,9.97380908941,0.940048645571,0.411805696316,9
|
||||
0,9.99977678382,6.91329929641,5.57858201258,9
|
||||
0,978.876096381,933.775364741,0.579170824236,10
|
||||
0,998.381016406,220.940470582,2.01491778565,10
|
||||
0,987.917644594,8.74667873567,0.364006099758,10
|
||||
0,1000.20994892,25.2945450565,3.5684398964,10
|
||||
0,1014.57141264,675.593540733,0.164174055535,10
|
||||
0,998.867283535,765.452750642,0.818425293238,10
|
||||
0,10.2143092481,273.576539531,137.111774354,11
|
||||
0,10.0366658918,842.469052609,2.32134375927,11
|
||||
0,10.1281202091,395.654057342,35.4184893063,11
|
||||
0,10.1443721289,960.058461049,272.887070637,11
|
||||
0,10.1353234784,535.51304462,2.15393842032,11
|
||||
1,10.0451640374,216.733858424,55.6533298016,11
|
||||
1,9.94254592171,44.5985537358,304.614176871,11
|
||||
0,10.1319257181,613.545504487,5.42391587912,11
|
||||
0,1020.63622468,997.476744201,0.509425590461,12
|
||||
0,986.304585519,822.669937965,0.605133561808,12
|
||||
1,1012.66863221,26.7185759069,0.0875458784828,12
|
||||
0,995.387656321,81.8540176995,0.691999430068,12
|
||||
0,1020.6587198,848.826964547,0.540159430526,12
|
||||
1,1003.81573853,379.84350931,0.0083682925194,12
|
||||
0,1021.60921516,641.376951467,1.12339054807,12
|
||||
0,1000.17585041,122.107138713,1.09906375372,12
|
||||
1,987.64802348,5.98448541152,0.124241987204,12
|
||||
1,9.94610136583,346.114985897,0.387708236565,13
|
||||
0,9.96812192337,313.278109696,0.00863026595671,13
|
||||
0,10.0181739194,36.7378924562,2.92179879835,13
|
||||
0,9.89000102695,164.273723971,0.685222591968,13
|
||||
0,10.1555212436,320.451459462,2.01341536261,13
|
||||
0,10.0085727613,999.767117646,0.462294934168,13
|
||||
1,9.93099658724,5.17478203909,0.213855205032,13
|
||||
0,10.0629454957,663.088181857,0.049022351462,13
|
||||
0,10.1109732417,734.904569784,1.6998450094,13
|
||||
0,1006.6015266,505.023453703,1.90870566777,14
|
||||
0,991.865769489,245.437343115,0.475109744256,14
|
||||
0,998.682734072,950.041057232,1.9256314201,14
|
||||
0,1005.02207209,2.9619314197,0.0517146822357,14
|
||||
0,1002.54526214,860.562681899,0.915687092848,14
|
||||
0,1000.38847359,808.416525088,0.209690673808,14
|
||||
1,992.557818382,373.889409453,0.107571728577,14
|
||||
0,1002.07722137,997.329626371,1.06504260496,14
|
||||
0,1000.40504333,949.832139189,0.539159980327,14
|
||||
0,10.1460179902,8.86082969819,135.953842715,15
|
||||
1,9.98529296553,2.87366448495,1.74249892194,15
|
||||
0,9.88942676744,9.4031821056,149.473066381,15
|
||||
1,10.0192953341,1.99685737576,1.79502473397,15
|
||||
0,10.0110654379,8.13112593726,87.7765628103,15
|
||||
0,997.148677047,733.936190093,1.49298494242,16
|
||||
0,1008.70465919,957.121652078,0.217414013634,16
|
||||
1,997.356154278,541.599587807,0.100855972216,16
|
||||
0,999.615897283,943.700501824,0.862874175879,16
|
||||
1,997.36859077,0.200859940848,0.13601892182,16
|
||||
0,10.0423255624,1.73855202168,0.956695338485,17
|
||||
1,9.88440755486,9.9994600678,0.305080529665,17
|
||||
0,10.0891026412,3.28031719474,0.364450973697,17
|
||||
0,9.90078644258,8.77839663617,0.456660574479,17
|
||||
1,9.79380029711,8.77220326156,0.527292005175,17
|
||||
0,9.93613887011,9.76270841268,1.40865693823,17
|
||||
0,10.0009239007,7.29056178263,0.498015866607,17
|
||||
0,9.96603319905,5.12498000925,0.517492532783,17
|
||||
0,10.0923827222,2.76652583955,1.56571226159,17
|
||||
1,10.0983782035,587.788120694,0.031756483687,18
|
||||
1,9.91397225464,994.527496819,3.72092164978,18
|
||||
0,10.1057472738,2.92894440088,0.683506438532,18
|
||||
0,10.1014053354,959.082038017,1.07039624129,18
|
||||
0,10.1433253044,322.515119317,0.51408278993,18
|
||||
1,9.82832510699,637.104433908,0.250272776427,18
|
||||
0,1000.49729075,2.75336888111,0.576634423274,19
|
||||
1,984.90338088,0.0295435794035,1.26273339929,19
|
||||
0,1001.53811442,4.64164410861,0.0293389959504,19
|
||||
1,995.875898395,5.08223403205,0.382330566779,19
|
||||
0,996.405937252,6.26395190757,0.453645816611,19
|
||||
0,10.0165140779,340.126072514,0.220794603312,20
|
||||
0,9.93482824816,951.672000448,0.124406293612,20
|
||||
0,10.1700278554,0.0140985961008,0.252452256311,20
|
||||
0,9.99825079542,950.382643896,0.875382402062,20
|
||||
0,9.87316410028,686.788257829,0.215886999825,20
|
||||
0,10.2893240654,89.3947931451,0.569578232133,20
|
||||
0,9.98689192703,0.430107535413,2.99869831728,20
|
||||
0,10.1365175107,972.279245093,0.0865099386744,20
|
||||
0,9.90744703306,50.810461183,3.00863325197,20
|
||||
|
@@ -21,27 +21,37 @@ import java.nio.file.Files
|
||||
|
||||
import org.scalatest.{BeforeAndAfterAll, FunSuite}
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
|
||||
class CheckpointManagerSuite extends FunSuite with PerTest with BeforeAndAfterAll {
|
||||
class CheckpointManagerSuite extends FunSuite with BeforeAndAfterAll {
|
||||
var sc: SparkContext = _
|
||||
|
||||
override def beforeAll(): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
.setMaster("local[*]")
|
||||
.setAppName("XGBoostSuite")
|
||||
sc = new SparkContext(conf)
|
||||
}
|
||||
|
||||
private lazy val (model4, model8) = {
|
||||
val training = buildDataFrame(Classification.train)
|
||||
import DataUtils._
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML).cache()
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_workers" -> sc.defaultParallelism)
|
||||
(new XGBoostClassifier(paramMap ++ Seq("num_round" -> 2)).fit(training),
|
||||
new XGBoostClassifier(paramMap ++ Seq("num_round" -> 4)).fit(training))
|
||||
"objective" -> "binary:logistic")
|
||||
(XGBoost.trainWithRDD(trainingRDD, paramMap, round = 2, nWorkers = sc.defaultParallelism),
|
||||
XGBoost.trainWithRDD(trainingRDD, paramMap, round = 4, nWorkers = sc.defaultParallelism))
|
||||
}
|
||||
|
||||
test("test update/load models") {
|
||||
val tmpPath = Files.createTempDirectory("test").toAbsolutePath.toString
|
||||
val manager = new CheckpointManager(sc, tmpPath)
|
||||
manager.updateCheckpoint(model4._booster)
|
||||
manager.updateCheckpoint(model4)
|
||||
var files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
|
||||
assert(files.length == 1)
|
||||
assert(files.head.getPath.getName == "4.model")
|
||||
assert(manager.loadCheckpointAsBooster.booster.getVersion == 4)
|
||||
|
||||
manager.updateCheckpoint(model8._booster)
|
||||
manager.updateCheckpoint(model8)
|
||||
files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
|
||||
assert(files.length == 1)
|
||||
assert(files.head.getPath.getName == "8.model")
|
||||
@@ -51,7 +61,7 @@ class CheckpointManagerSuite extends FunSuite with PerTest with BeforeAndAfterAl
|
||||
test("test cleanUpHigherVersions") {
|
||||
val tmpPath = Files.createTempDirectory("test").toAbsolutePath.toString
|
||||
val manager = new CheckpointManager(sc, tmpPath)
|
||||
manager.updateCheckpoint(model8._booster)
|
||||
manager.updateCheckpoint(model8)
|
||||
manager.cleanUpHigherVersions(round = 8)
|
||||
assert(new File(s"$tmpPath/8.model").exists())
|
||||
|
||||
@@ -64,8 +74,7 @@ class CheckpointManagerSuite extends FunSuite with PerTest with BeforeAndAfterAl
|
||||
val manager = new CheckpointManager(sc, tmpPath)
|
||||
assertResult(Seq(7))(manager.getCheckpointRounds(checkpointInterval = 0, round = 7))
|
||||
assertResult(Seq(2, 4, 6, 7))(manager.getCheckpointRounds(checkpointInterval = 2, round = 7))
|
||||
manager.updateCheckpoint(model4._booster)
|
||||
manager.updateCheckpoint(model4)
|
||||
assertResult(Seq(4, 6, 7))(manager.getCheckpointRounds(2, 7))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -18,14 +18,11 @@ package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.io.File
|
||||
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.scalatest.{BeforeAndAfterEach, FunSuite}
|
||||
|
||||
trait PerTest extends BeforeAndAfterEach { self: FunSuite =>
|
||||
|
||||
protected val numWorkers: Int = Runtime.getRuntime.availableProcessors()
|
||||
|
||||
@transient private var currentSession: SparkSession = _
|
||||
@@ -38,7 +35,6 @@ trait PerTest extends BeforeAndAfterEach { self: FunSuite =>
|
||||
.appName("XGBoostSuite")
|
||||
.config("spark.ui.enabled", false)
|
||||
.config("spark.driver.memory", "512m")
|
||||
.config("spark.task.cpus", 1)
|
||||
|
||||
override def beforeEach(): Unit = getOrCreateSession
|
||||
|
||||
@@ -66,30 +62,4 @@ trait PerTest extends BeforeAndAfterEach { self: FunSuite =>
|
||||
file.delete()
|
||||
}
|
||||
}
|
||||
|
||||
protected def buildDataFrame(
|
||||
labeledPoints: Seq[XGBLabeledPoint],
|
||||
numPartitions: Int = numWorkers): DataFrame = {
|
||||
import DataUtils._
|
||||
val it = labeledPoints.iterator.zipWithIndex
|
||||
.map { case (labeledPoint: XGBLabeledPoint, id: Int) =>
|
||||
(id, labeledPoint.label, labeledPoint.features)
|
||||
}
|
||||
|
||||
ss.createDataFrame(sc.parallelize(it.toList, numPartitions))
|
||||
.toDF("id", "label", "features")
|
||||
}
|
||||
|
||||
protected def buildDataFrameWithGroup(
|
||||
labeledPoints: Seq[XGBLabeledPoint],
|
||||
numPartitions: Int = numWorkers): DataFrame = {
|
||||
import DataUtils._
|
||||
val it = labeledPoints.iterator.zipWithIndex
|
||||
.map { case (labeledPoint: XGBLabeledPoint, id: Int) =>
|
||||
(id, labeledPoint.label, labeledPoint.features, labeledPoint.group)
|
||||
}
|
||||
|
||||
ss.createDataFrame(sc.parallelize(it.toList, numPartitions))
|
||||
.toDF("id", "label", "features", "group")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,167 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.io.{File, FileNotFoundException}
|
||||
import java.util.Arrays
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.DMatrix
|
||||
|
||||
import scala.util.Random
|
||||
import org.apache.spark.ml.feature._
|
||||
import org.apache.spark.ml.{Pipeline, PipelineModel}
|
||||
import org.apache.spark.network.util.JavaUtils
|
||||
import org.scalatest.{BeforeAndAfterAll, FunSuite}
|
||||
|
||||
class PersistenceSuite extends FunSuite with PerTest with BeforeAndAfterAll {
|
||||
|
||||
private var tempDir: File = _
|
||||
|
||||
override def beforeAll(): Unit = {
|
||||
super.beforeAll()
|
||||
|
||||
tempDir = new File(System.getProperty("java.io.tmpdir"), this.getClass.getName)
|
||||
if (tempDir.exists) {
|
||||
tempDir.delete
|
||||
}
|
||||
tempDir.mkdirs
|
||||
}
|
||||
|
||||
override def afterAll(): Unit = {
|
||||
JavaUtils.deleteRecursively(tempDir)
|
||||
super.afterAll()
|
||||
}
|
||||
|
||||
private def delete(f: File) {
|
||||
if (f.exists) {
|
||||
if (f.isDirectory) {
|
||||
for (c <- f.listFiles) {
|
||||
delete(c)
|
||||
}
|
||||
}
|
||||
if (!f.delete) {
|
||||
throw new FileNotFoundException("Failed to delete file: " + f)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("test persistence of XGBoostClassifier and XGBoostClassificationModel") {
|
||||
val eval = new EvalError()
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
|
||||
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_round" -> "10", "num_workers" -> numWorkers)
|
||||
val xgbc = new XGBoostClassifier(paramMap)
|
||||
val xgbcPath = new File(tempDir, "xgbc").getPath
|
||||
xgbc.write.overwrite().save(xgbcPath)
|
||||
val xgbc2 = XGBoostClassifier.load(xgbcPath)
|
||||
val paramMap2 = xgbc2.MLlib2XGBoostParams
|
||||
paramMap.foreach {
|
||||
case (k, v) => assert(v.toString == paramMap2(k).toString)
|
||||
}
|
||||
|
||||
val model = xgbc.fit(trainingDF)
|
||||
val evalResults = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
assert(evalResults < 0.1)
|
||||
val xgbcModelPath = new File(tempDir, "xgbcModel").getPath
|
||||
model.write.overwrite.save(xgbcModelPath)
|
||||
val model2 = XGBoostClassificationModel.load(xgbcModelPath)
|
||||
assert(Arrays.equals(model._booster.toByteArray, model2._booster.toByteArray))
|
||||
|
||||
assert(model.getEta === model2.getEta)
|
||||
assert(model.getNumRound === model2.getNumRound)
|
||||
assert(model.getRawPredictionCol === model2.getRawPredictionCol)
|
||||
val evalResults2 = eval.eval(model2._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
assert(evalResults === evalResults2)
|
||||
}
|
||||
|
||||
test("test persistence of XGBoostRegressor and XGBoostRegressionModel") {
|
||||
val eval = new EvalError()
|
||||
val trainingDF = buildDataFrame(Regression.train)
|
||||
val testDM = new DMatrix(Regression.test.iterator)
|
||||
|
||||
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear", "num_round" -> "10", "num_workers" -> numWorkers)
|
||||
val xgbr = new XGBoostRegressor(paramMap)
|
||||
val xgbrPath = new File(tempDir, "xgbr").getPath
|
||||
xgbr.write.overwrite().save(xgbrPath)
|
||||
val xgbr2 = XGBoostRegressor.load(xgbrPath)
|
||||
val paramMap2 = xgbr2.MLlib2XGBoostParams
|
||||
paramMap.foreach {
|
||||
case (k, v) => assert(v.toString == paramMap2(k).toString)
|
||||
}
|
||||
|
||||
val model = xgbr.fit(trainingDF)
|
||||
val evalResults = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
assert(evalResults < 0.1)
|
||||
val xgbrModelPath = new File(tempDir, "xgbrModel").getPath
|
||||
model.write.overwrite.save(xgbrModelPath)
|
||||
val model2 = XGBoostRegressionModel.load(xgbrModelPath)
|
||||
assert(Arrays.equals(model._booster.toByteArray, model2._booster.toByteArray))
|
||||
|
||||
assert(model.getEta === model2.getEta)
|
||||
assert(model.getNumRound === model2.getNumRound)
|
||||
assert(model.getPredictionCol === model2.getPredictionCol)
|
||||
val evalResults2 = eval.eval(model2._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
assert(evalResults === evalResults2)
|
||||
}
|
||||
|
||||
test("test persistence of MLlib pipeline with XGBoostClassificationModel") {
|
||||
|
||||
val r = new Random(0)
|
||||
// maybe move to shared context, but requires session to import implicits
|
||||
val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
|
||||
toDF("feature", "label")
|
||||
|
||||
val assembler = new VectorAssembler()
|
||||
.setInputCols(df.columns.filter(!_.contains("label")))
|
||||
.setOutputCol("features")
|
||||
|
||||
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_round" -> "10", "num_workers" -> numWorkers,
|
||||
"tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala"))
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
|
||||
// Construct MLlib pipeline, save and load
|
||||
val pipeline = new Pipeline().setStages(Array(assembler, xgb))
|
||||
val pipePath = new File(tempDir, "pipeline").getPath
|
||||
pipeline.write.overwrite().save(pipePath)
|
||||
val pipeline2 = Pipeline.read.load(pipePath)
|
||||
val xgb2 = pipeline2.getStages(1).asInstanceOf[XGBoostClassifier]
|
||||
val paramMap2 = xgb2.MLlib2XGBoostParams
|
||||
paramMap.foreach {
|
||||
case (k, v) => assert(v.toString == paramMap2(k).toString)
|
||||
}
|
||||
|
||||
// Model training, save and load
|
||||
val pipeModel = pipeline.fit(df)
|
||||
val pipeModelPath = new File(tempDir, "pipelineModel").getPath
|
||||
pipeModel.write.overwrite.save(pipeModelPath)
|
||||
val pipeModel2 = PipelineModel.load(pipeModelPath)
|
||||
|
||||
val xgbModel = pipeModel.stages(1).asInstanceOf[XGBoostClassificationModel]
|
||||
val xgbModel2 = pipeModel2.stages(1).asInstanceOf[XGBoostClassificationModel]
|
||||
|
||||
assert(Arrays.equals(xgbModel._booster.toByteArray, xgbModel2._booster.toByteArray))
|
||||
|
||||
assert(xgbModel.getEta === xgbModel2.getEta)
|
||||
assert(xgbModel.getNumRound === xgbModel2.getNumRound)
|
||||
assert(xgbModel.getRawPredictionCol === xgbModel2.getRawPredictionCol)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,8 +16,8 @@
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import scala.collection.mutable
|
||||
import scala.io.Source
|
||||
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
|
||||
trait TrainTestData {
|
||||
@@ -48,17 +48,6 @@ trait TrainTestData {
|
||||
XGBLabeledPoint(label, null, values)
|
||||
}.toList
|
||||
}
|
||||
|
||||
protected def getLabeledPointsWithGroup(resource: String): Seq[XGBLabeledPoint] = {
|
||||
getResourceLines(resource).map { line =>
|
||||
val original = line.split(",")
|
||||
val length = original.length
|
||||
val label = original.head.toFloat
|
||||
val group = original.last.toInt
|
||||
val values = original.slice(1, length - 1).map(_.toFloat)
|
||||
XGBLabeledPoint(label, null, values, 1f, group, Float.NaN)
|
||||
}.toList
|
||||
}
|
||||
}
|
||||
|
||||
object Classification extends TrainTestData {
|
||||
@@ -91,8 +80,11 @@ object Regression extends TrainTestData {
|
||||
}
|
||||
|
||||
object Ranking extends TrainTestData {
|
||||
val train: Seq[XGBLabeledPoint] = getLabeledPointsWithGroup("/rank.train.csv")
|
||||
val test: Seq[XGBLabeledPoint] = getLabeledPoints("/rank.test.txt", zeroBased = false)
|
||||
val train0: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo-0.txt.train", zeroBased = false)
|
||||
val train1: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo-1.txt.train", zeroBased = false)
|
||||
val trainGroup0: Seq[Int] = getGroups("/rank-demo-0.txt.train.group")
|
||||
val trainGroup1: Seq[Int] = getGroups("/rank-demo-1.txt.train.group")
|
||||
val test: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo.txt.test", zeroBased = false)
|
||||
|
||||
private def getGroups(resource: String): Seq[Int] = {
|
||||
getResourceLines(resource).map(_.toInt).toList
|
||||
|
||||
@@ -1,287 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import org.apache.spark.ml.linalg._
|
||||
import org.apache.spark.ml.param.ParamMap
|
||||
import org.apache.spark.sql._
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
|
||||
test("XGBoost-Spark XGBoostClassifier ouput should match XGBoost4j") {
|
||||
val trainingDM = new DMatrix(Classification.train.iterator)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val testDF = buildDataFrame(Classification.test)
|
||||
val round = 5
|
||||
|
||||
val paramMap = Map(
|
||||
"eta" -> "1",
|
||||
"max_depth" -> "6",
|
||||
"silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
|
||||
val model1 = ScalaXGBoost.train(trainingDM, paramMap, round)
|
||||
val prediction1 = model1.predict(testDM)
|
||||
|
||||
val model2 = new XGBoostClassifier(paramMap ++ Array("num_round" -> round,
|
||||
"num_workers" -> numWorkers)).fit(trainingDF)
|
||||
|
||||
val prediction2 = model2.transform(testDF).
|
||||
collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("probability"))).toMap
|
||||
|
||||
assert(testDF.count() === prediction2.size)
|
||||
// the vector length in probability column is 2 since we have to fit to the evaluator in Spark
|
||||
for (i <- prediction1.indices) {
|
||||
assert(prediction1(i).length === prediction2(i).values.length - 1)
|
||||
for (j <- prediction1(i).indices) {
|
||||
assert(prediction1(i)(j) === prediction2(i)(j + 1))
|
||||
}
|
||||
}
|
||||
|
||||
val prediction3 = model1.predict(testDM, outPutMargin = true)
|
||||
val prediction4 = model2.transform(testDF).
|
||||
collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("rawPrediction"))).toMap
|
||||
|
||||
assert(testDF.count() === prediction4.size)
|
||||
for (i <- prediction3.indices) {
|
||||
assert(prediction3(i).length === prediction4(i).values.length)
|
||||
for (j <- prediction3(i).indices) {
|
||||
assert(prediction3(i)(j) === prediction4(i)(j))
|
||||
}
|
||||
}
|
||||
|
||||
// check the equality of single instance prediction
|
||||
val firstOfDM = testDM.slice(Array(0))
|
||||
val firstOfDF = testDF.head().getAs[Vector]("features")
|
||||
val prediction5 = math.round(model1.predict(firstOfDM)(0)(0))
|
||||
val prediction6 = model2.predict(firstOfDF)
|
||||
assert(prediction5 === prediction6)
|
||||
}
|
||||
|
||||
test("Set params in XGBoost and MLlib way should produce same model") {
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val testDF = buildDataFrame(Classification.test)
|
||||
val round = 5
|
||||
|
||||
val paramMap = Map(
|
||||
"eta" -> "1",
|
||||
"max_depth" -> "6",
|
||||
"silent" -> "1",
|
||||
"objective" -> "binary:logistic",
|
||||
"num_round" -> round,
|
||||
"num_workers" -> numWorkers)
|
||||
|
||||
// Set params in XGBoost way
|
||||
val model1 = new XGBoostClassifier(paramMap).fit(trainingDF)
|
||||
// Set params in MLlib way
|
||||
val model2 = new XGBoostClassifier()
|
||||
.setEta(1)
|
||||
.setMaxDepth(6)
|
||||
.setSilent(1)
|
||||
.setObjective("binary:logistic")
|
||||
.setNumRound(round)
|
||||
.setNumWorkers(numWorkers)
|
||||
.fit(trainingDF)
|
||||
|
||||
val prediction1 = model1.transform(testDF).select("prediction").collect()
|
||||
val prediction2 = model2.transform(testDF).select("prediction").collect()
|
||||
|
||||
prediction1.zip(prediction2).foreach { case (Row(p1: Double), Row(p2: Double)) =>
|
||||
assert(p1 === p2)
|
||||
}
|
||||
}
|
||||
|
||||
test("test schema of XGBoostClassificationModel") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val testDF = buildDataFrame(Classification.test)
|
||||
|
||||
val model = new XGBoostClassifier(paramMap).fit(trainingDF)
|
||||
|
||||
model.setRawPredictionCol("raw_prediction")
|
||||
.setProbabilityCol("probability_prediction")
|
||||
.setPredictionCol("final_prediction")
|
||||
var predictionDF = model.transform(testDF)
|
||||
assert(predictionDF.columns.contains("id"))
|
||||
assert(predictionDF.columns.contains("features"))
|
||||
assert(predictionDF.columns.contains("label"))
|
||||
assert(predictionDF.columns.contains("raw_prediction"))
|
||||
assert(predictionDF.columns.contains("probability_prediction"))
|
||||
assert(predictionDF.columns.contains("final_prediction"))
|
||||
model.setRawPredictionCol("").setPredictionCol("final_prediction")
|
||||
predictionDF = model.transform(testDF)
|
||||
assert(predictionDF.columns.contains("raw_prediction") === false)
|
||||
assert(predictionDF.columns.contains("final_prediction"))
|
||||
model.setRawPredictionCol("raw_prediction").setPredictionCol("")
|
||||
predictionDF = model.transform(testDF)
|
||||
assert(predictionDF.columns.contains("raw_prediction"))
|
||||
assert(predictionDF.columns.contains("final_prediction") === false)
|
||||
|
||||
assert(model.summary.trainObjectiveHistory.length === 5)
|
||||
assert(model.summary.testObjectiveHistory.isEmpty)
|
||||
}
|
||||
|
||||
test("XGBoost and Spark parameters synchronize correctly") {
|
||||
val xgbParamMap = Map("eta" -> "1", "objective" -> "binary:logistic")
|
||||
// from xgboost params to spark params
|
||||
val xgb = new XGBoostClassifier(xgbParamMap)
|
||||
assert(xgb.getEta === 1.0)
|
||||
assert(xgb.getObjective === "binary:logistic")
|
||||
// from spark to xgboost params
|
||||
val xgbCopy = xgb.copy(ParamMap.empty)
|
||||
assert(xgbCopy.MLlib2XGBoostParams("eta").toString.toDouble === 1.0)
|
||||
assert(xgbCopy.MLlib2XGBoostParams("objective").toString === "binary:logistic")
|
||||
val xgbCopy2 = xgb.copy(ParamMap.empty.put(xgb.evalMetric, "logloss"))
|
||||
assert(xgbCopy2.MLlib2XGBoostParams("eval_metric").toString === "logloss")
|
||||
}
|
||||
|
||||
test("multi class classification") {
|
||||
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "multi:softmax", "num_class" -> "6", "num_round" -> 5,
|
||||
"num_workers" -> numWorkers)
|
||||
val trainingDF = buildDataFrame(MultiClassification.train)
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model = xgb.fit(trainingDF)
|
||||
assert(model.getEta == 0.1)
|
||||
assert(model.getMaxDepth == 6)
|
||||
assert(model.numClasses == 6)
|
||||
}
|
||||
|
||||
test("use base margin") {
|
||||
val training1 = buildDataFrame(Classification.train)
|
||||
val training2 = training1.withColumn("margin", functions.rand())
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "test_train_split" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model1 = xgb.fit(training1)
|
||||
val model2 = xgb.setBaseMarginCol("margin").fit(training2)
|
||||
val prediction1 = model1.transform(test).select(model1.getProbabilityCol)
|
||||
.collect().map(row => row.getAs[Vector](0))
|
||||
val prediction2 = model2.transform(test).select(model2.getProbabilityCol)
|
||||
.collect().map(row => row.getAs[Vector](0))
|
||||
var count = 0
|
||||
for ((r1, r2) <- prediction1.zip(prediction2)) {
|
||||
if (!r1.equals(r2)) count = count + 1
|
||||
}
|
||||
assert(count != 0)
|
||||
}
|
||||
|
||||
test("training summary") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_round" -> 5, "nWorkers" -> numWorkers)
|
||||
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model = xgb.fit(trainingDF)
|
||||
|
||||
assert(model.summary.trainObjectiveHistory.length === 5)
|
||||
assert(model.summary.testObjectiveHistory.isEmpty)
|
||||
}
|
||||
|
||||
test("train/test split") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
val Some(testObjectiveHistory) = model.summary.testObjectiveHistory
|
||||
assert(testObjectiveHistory.length === 5)
|
||||
assert(model.summary.trainObjectiveHistory !== testObjectiveHistory)
|
||||
}
|
||||
|
||||
test("test predictionLeaf") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val groundTruth = test.count()
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
model.setLeafPredictionCol("predictLeaf")
|
||||
val resultDF = model.transform(test)
|
||||
assert(resultDF.count == groundTruth)
|
||||
assert(resultDF.columns.contains("predictLeaf"))
|
||||
}
|
||||
|
||||
test("test predictionLeaf with empty column name") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
model.setLeafPredictionCol("")
|
||||
val resultDF = model.transform(test)
|
||||
assert(!resultDF.columns.contains("predictLeaf"))
|
||||
}
|
||||
|
||||
test("test predictionContrib") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val groundTruth = test.count()
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
model.setContribPredictionCol("predictContrib")
|
||||
val resultDF = model.transform(buildDataFrame(Classification.test))
|
||||
assert(resultDF.count == groundTruth)
|
||||
assert(resultDF.columns.contains("predictContrib"))
|
||||
}
|
||||
|
||||
test("test predictionContrib with empty column name") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
model.setContribPredictionCol("")
|
||||
val resultDF = model.transform(test)
|
||||
assert(!resultDF.columns.contains("predictContrib"))
|
||||
}
|
||||
|
||||
test("test predictionLeaf and predictionContrib") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val groundTruth = test.count()
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
model.setLeafPredictionCol("predictLeaf")
|
||||
model.setContribPredictionCol("predictContrib")
|
||||
val resultDF = model.transform(buildDataFrame(Classification.test))
|
||||
assert(resultDF.count == groundTruth)
|
||||
assert(resultDF.columns.contains("predictLeaf"))
|
||||
assert(resultDF.columns.contains("predictContrib"))
|
||||
}
|
||||
}
|
||||
@@ -17,34 +17,36 @@
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
|
||||
import org.apache.spark.sql._
|
||||
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
class XGBoostConfigureSuite extends FunSuite with PerTest {
|
||||
|
||||
override def sparkSessionBuilder: SparkSession.Builder = super.sparkSessionBuilder
|
||||
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||
.config("spark.kryo.classesToRegister", classOf[Booster].getName)
|
||||
|
||||
test("nthread configuration must be no larger than spark.task.cpus") {
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_workers" -> numWorkers,
|
||||
"objective" -> "binary:logistic",
|
||||
"nthread" -> (sc.getConf.getInt("spark.task.cpus", 1) + 1))
|
||||
intercept[IllegalArgumentException] {
|
||||
new XGBoostClassifier(paramMap ++ Seq("num_round" -> 2)).fit(training)
|
||||
XGBoost.trainWithRDD(sc.parallelize(List()), paramMap, 5, numWorkers)
|
||||
}
|
||||
}
|
||||
|
||||
test("kryoSerializer test") {
|
||||
import DataUtils._
|
||||
// TODO write an isolated test for Booster.
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator, null)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator, null)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
"objective" -> "binary:logistic")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
val eval = new EvalError()
|
||||
assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,265 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.DenseVector
|
||||
import org.apache.spark.ml.param.ParamMap
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.types.DataTypes
|
||||
import org.scalatest.FunSuite
|
||||
import org.scalatest.prop.TableDrivenPropertyChecks
|
||||
|
||||
class XGBoostDFSuite extends FunSuite with PerTest with TableDrivenPropertyChecks {
|
||||
private def buildDataFrame(
|
||||
labeledPoints: Seq[XGBLabeledPoint],
|
||||
numPartitions: Int = numWorkers): DataFrame = {
|
||||
import DataUtils._
|
||||
val it = labeledPoints.iterator.zipWithIndex
|
||||
.map { case (labeledPoint: XGBLabeledPoint, id: Int) =>
|
||||
(id, labeledPoint.label, labeledPoint.features)
|
||||
}
|
||||
|
||||
ss.createDataFrame(sc.parallelize(it.toList, numPartitions))
|
||||
.toDF("id", "label", "features")
|
||||
}
|
||||
|
||||
test("test consistency and order preservation of dataframe-based model") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val trainingItr = Classification.train.iterator
|
||||
val testItr = Classification.test.iterator
|
||||
val round = 5
|
||||
val trainDMatrix = new DMatrix(trainingItr)
|
||||
val testDMatrix = new DMatrix(testItr)
|
||||
val xgboostModel = ScalaXGBoost.train(trainDMatrix, paramMap, round)
|
||||
val predResultFromSeq = xgboostModel.predict(testDMatrix)
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
|
||||
round = round, nWorkers = numWorkers)
|
||||
val testDF = buildDataFrame(Classification.test)
|
||||
val predResultsFromDF = xgBoostModelWithDF.setExternalMemory(true).transform(testDF).
|
||||
collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("probabilities"))).toMap
|
||||
assert(testDF.count() === predResultsFromDF.size)
|
||||
// the vector length in probabilties column is 2 since we have to fit to the evaluator in
|
||||
// Spark
|
||||
for (i <- predResultFromSeq.indices) {
|
||||
assert(predResultFromSeq(i).length === predResultsFromDF(i).values.length - 1)
|
||||
for (j <- predResultFromSeq(i).indices) {
|
||||
assert(predResultFromSeq(i)(j) === predResultsFromDF(i)(j + 1))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("test transformLeaf") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
|
||||
round = 5, nWorkers = numWorkers)
|
||||
val testDF = buildDataFrame(Classification.test)
|
||||
xgBoostModelWithDF.transformLeaf(testDF).show()
|
||||
}
|
||||
|
||||
test("test schema of XGBoostRegressionModel") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear")
|
||||
val trainingDF = buildDataFrame(Regression.train)
|
||||
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
|
||||
round = 5, nWorkers = numWorkers, useExternalMemory = true)
|
||||
xgBoostModelWithDF.setPredictionCol("final_prediction")
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val predictionDF = xgBoostModelWithDF.setExternalMemory(true).transform(testDF)
|
||||
assert(predictionDF.columns.contains("id"))
|
||||
assert(predictionDF.columns.contains("features"))
|
||||
assert(predictionDF.columns.contains("label"))
|
||||
assert(predictionDF.columns.contains("final_prediction"))
|
||||
predictionDF.show()
|
||||
}
|
||||
|
||||
test("test schema of XGBoostClassificationModel") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
|
||||
round = 5, nWorkers = numWorkers, useExternalMemory = true)
|
||||
xgBoostModelWithDF.asInstanceOf[XGBoostClassificationModel].setRawPredictionCol(
|
||||
"raw_prediction").setPredictionCol("final_prediction")
|
||||
val testDF = buildDataFrame(Classification.test)
|
||||
var predictionDF = xgBoostModelWithDF.setExternalMemory(true).transform(testDF)
|
||||
assert(predictionDF.columns.contains("id"))
|
||||
assert(predictionDF.columns.contains("features"))
|
||||
assert(predictionDF.columns.contains("label"))
|
||||
assert(predictionDF.columns.contains("raw_prediction"))
|
||||
assert(predictionDF.columns.contains("final_prediction"))
|
||||
xgBoostModelWithDF.asInstanceOf[XGBoostClassificationModel].setRawPredictionCol("").
|
||||
setPredictionCol("final_prediction")
|
||||
predictionDF = xgBoostModelWithDF.transform(testDF)
|
||||
assert(predictionDF.columns.contains("id"))
|
||||
assert(predictionDF.columns.contains("features"))
|
||||
assert(predictionDF.columns.contains("label"))
|
||||
assert(predictionDF.columns.contains("raw_prediction") === false)
|
||||
assert(predictionDF.columns.contains("final_prediction"))
|
||||
xgBoostModelWithDF.asInstanceOf[XGBoostClassificationModel].
|
||||
setRawPredictionCol("raw_prediction").setPredictionCol("")
|
||||
predictionDF = xgBoostModelWithDF.transform(testDF)
|
||||
assert(predictionDF.columns.contains("id"))
|
||||
assert(predictionDF.columns.contains("features"))
|
||||
assert(predictionDF.columns.contains("label"))
|
||||
assert(predictionDF.columns.contains("raw_prediction"))
|
||||
assert(predictionDF.columns.contains("final_prediction") === false)
|
||||
}
|
||||
|
||||
test("xgboost and spark parameters synchronize correctly") {
|
||||
val xgbParamMap = Map("eta" -> "1", "objective" -> "binary:logistic")
|
||||
// from xgboost params to spark params
|
||||
val xgbEstimator = new XGBoostEstimator(xgbParamMap)
|
||||
assert(xgbEstimator.get(xgbEstimator.eta).get === 1.0)
|
||||
assert(xgbEstimator.get(xgbEstimator.objective).get === "binary:logistic")
|
||||
// from spark to xgboost params
|
||||
val xgbEstimatorCopy = xgbEstimator.copy(ParamMap.empty)
|
||||
assert(xgbEstimatorCopy.fromParamsToXGBParamMap("eta").toString.toDouble === 1.0)
|
||||
assert(xgbEstimatorCopy.fromParamsToXGBParamMap("objective").toString === "binary:logistic")
|
||||
}
|
||||
|
||||
test("eval_metric is configured correctly") {
|
||||
val xgbParamMap = Map("eta" -> "1", "objective" -> "binary:logistic")
|
||||
val xgbEstimator = new XGBoostEstimator(xgbParamMap)
|
||||
assert(xgbEstimator.get(xgbEstimator.evalMetric).get === "error")
|
||||
val sparkParamMap = ParamMap.empty
|
||||
val xgbEstimatorCopy = xgbEstimator.copy(sparkParamMap)
|
||||
assert(xgbEstimatorCopy.fromParamsToXGBParamMap("eval_metric") === "error")
|
||||
val xgbEstimatorCopy1 = xgbEstimator.copy(sparkParamMap.put(xgbEstimator.evalMetric, "logloss"))
|
||||
assert(xgbEstimatorCopy1.fromParamsToXGBParamMap("eval_metric") === "logloss")
|
||||
}
|
||||
|
||||
ignore("fast histogram algorithm parameters are exposed correctly") {
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
|
||||
"eval_metric" -> "error")
|
||||
val testItr = Classification.test.iterator
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
|
||||
round = 10, nWorkers = math.min(2, numWorkers))
|
||||
val error = new EvalError
|
||||
val testSetDMatrix = new DMatrix(testItr)
|
||||
assert(error.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
}
|
||||
|
||||
test("multi_class classification test") {
|
||||
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "multi:softmax", "num_class" -> "6")
|
||||
val trainingDF = buildDataFrame(MultiClassification.train)
|
||||
XGBoost.trainWithDataFrame(trainingDF.toDF(), paramMap, round = 5, nWorkers = numWorkers)
|
||||
}
|
||||
|
||||
test("test DF use nested groupData") {
|
||||
val trainingDF = buildDataFrame(Ranking.train0, 1)
|
||||
.union(buildDataFrame(Ranking.train1, 1))
|
||||
val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0, Ranking.trainGroup1)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "rank:pairwise", "groupData" -> trainGroupData)
|
||||
|
||||
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
|
||||
round = 5, nWorkers = 2)
|
||||
val testDF = buildDataFrame(Ranking.test)
|
||||
val predResultsFromDF = xgBoostModelWithDF.setExternalMemory(true).transform(testDF).
|
||||
collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("features"))).toMap
|
||||
assert(testDF.count() === predResultsFromDF.size)
|
||||
}
|
||||
|
||||
test("params of estimator and produced model are coordinated correctly") {
|
||||
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "multi:softmax", "num_class" -> "6")
|
||||
val trainingDF = buildDataFrame(MultiClassification.train)
|
||||
val model = XGBoost.trainWithDataFrame(trainingDF, paramMap, round = 5, nWorkers = numWorkers)
|
||||
assert(model.get[Double](model.eta).get == 0.1)
|
||||
assert(model.get[Int](model.maxDepth).get == 6)
|
||||
assert(model.asInstanceOf[XGBoostClassificationModel].numOfClasses == 6)
|
||||
}
|
||||
|
||||
test("test use base margin") {
|
||||
import DataUtils._
|
||||
val trainingDf = buildDataFrame(Classification.train)
|
||||
val trainingDfWithMargin = trainingDf.withColumn("margin", functions.rand())
|
||||
val testRDD = sc.parallelize(Classification.test.map(_.features))
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "baseMarginCol" -> "margin",
|
||||
"testTrainSplit" -> 0.5)
|
||||
|
||||
def trainPredict(df: Dataset[_]): Array[Float] = {
|
||||
XGBoost.trainWithDataFrame(df, paramMap, round = 1, nWorkers = numWorkers)
|
||||
.predict(testRDD)
|
||||
.map { case Array(p) => p }
|
||||
.collect()
|
||||
}
|
||||
|
||||
val pred = trainPredict(trainingDf)
|
||||
val predWithMargin = trainPredict(trainingDfWithMargin)
|
||||
assert((pred, predWithMargin).zipped.exists { case (p, pwm) => p !== pwm })
|
||||
}
|
||||
|
||||
test("test use weight") {
|
||||
import DataUtils._
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear", "weightCol" -> "weight")
|
||||
|
||||
val getWeightFromId = udf({id: Int => if (id == 0) 1.0f else 0.001f}, DataTypes.FloatType)
|
||||
val trainingDF = buildDataFrame(Regression.train)
|
||||
.withColumn("weight", getWeightFromId(col("id")))
|
||||
|
||||
val model = XGBoost.trainWithDataFrame(trainingDF, paramMap, round = 5,
|
||||
nWorkers = numWorkers, useExternalMemory = true)
|
||||
.setPredictionCol("final_prediction")
|
||||
.setExternalMemory(true)
|
||||
val testRDD = sc.parallelize(Regression.test.map(_.features))
|
||||
val predictions = model.predict(testRDD).collect().flatten
|
||||
|
||||
// The predictions heavily relies on the first training instance, and thus are very close.
|
||||
predictions.foreach(pred => assert(math.abs(pred - predictions.head) <= 0.01f))
|
||||
}
|
||||
|
||||
test("training summary") {
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
|
||||
val trainingDf = buildDataFrame(Classification.train)
|
||||
val model = XGBoost.trainWithDataFrame(trainingDf, paramMap, round = 5,
|
||||
nWorkers = numWorkers)
|
||||
|
||||
assert(model.summary.trainObjectiveHistory.length === 5)
|
||||
assert(model.summary.testObjectiveHistory.isEmpty)
|
||||
}
|
||||
|
||||
test("train/test split") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "trainTestRatio" -> "0.5")
|
||||
val trainingDf = buildDataFrame(Classification.train)
|
||||
|
||||
forAll(Table("useExternalMemory", false, true)) { useExternalMemory =>
|
||||
val model = XGBoost.trainWithDataFrame(trainingDf, paramMap, round = 5,
|
||||
nWorkers = numWorkers, useExternalMemory = useExternalMemory)
|
||||
val Some(testObjectiveHistory) = model.summary.testObjectiveHistory
|
||||
assert(testObjectiveHistory.length === 5)
|
||||
assert(model.summary.trainObjectiveHistory !== testObjectiveHistory)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -18,18 +18,19 @@ package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.nio.file.Files
|
||||
import java.util.concurrent.LinkedBlockingDeque
|
||||
|
||||
import scala.util.Random
|
||||
import ml.dmlc.xgboost4j.java.Rabit
|
||||
import ml.dmlc.xgboost4j.scala.DMatrix
|
||||
import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
|
||||
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.spark.ml.linalg.Vectors
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.{DenseVector, Vectors, Vector => SparkVector}
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.scalatest.FunSuite
|
||||
import scala.util.Random
|
||||
|
||||
class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
|
||||
test("test Rabit allreduce to validate Scala-implemented Rabit tracker") {
|
||||
val vectorLength = 100
|
||||
val rdd = sc.parallelize(
|
||||
@@ -86,153 +87,283 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
test("training with external memory cache") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"use_external_memory" -> true)
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers, useExternalMemory = true)
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
}
|
||||
|
||||
|
||||
test("training with Scala-implemented Rabit tracker") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala"))
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic",
|
||||
"tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala")).toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers)
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
}
|
||||
|
||||
|
||||
ignore("test with fast histo depthwise") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise",
|
||||
"eval_metric" -> "error", "num_round" -> 5, "num_workers" -> math.min(numWorkers, 2))
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "depthwise", "eval_metric" -> "error")
|
||||
// TODO: histogram algorithm seems to be very very sensitive to worker number
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = math.min(numWorkers, 2))
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
}
|
||||
|
||||
ignore("test with fast histo lossguide") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide",
|
||||
"max_leaves" -> "8", "eval_metric" -> "error", "num_round" -> 5,
|
||||
"num_workers" -> math.min(numWorkers, 2))
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "lossguide", "max_leaves" -> "8", "eval_metric" -> "error")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = math.min(numWorkers, 2))
|
||||
val x = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix)
|
||||
assert(x < 0.1)
|
||||
}
|
||||
|
||||
ignore("test with fast histo lossguide with max bin") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16",
|
||||
"eval_metric" -> "error", "num_round" -> 5, "num_workers" -> math.min(numWorkers, 2))
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16",
|
||||
"eval_metric" -> "error")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = math.min(numWorkers, 2))
|
||||
val x = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix)
|
||||
assert(x < 0.1)
|
||||
}
|
||||
|
||||
ignore("test with fast histo depthwidth with max depth") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "depthwise", "max_leaves" -> "8", "max_depth" -> "2",
|
||||
"eval_metric" -> "error", "num_round" -> 10, "num_workers" -> math.min(numWorkers, 2))
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
"eval_metric" -> "error")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 10,
|
||||
nWorkers = math.min(numWorkers, 2))
|
||||
val x = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix)
|
||||
assert(x < 0.1)
|
||||
}
|
||||
|
||||
ignore("test with fast histo depthwidth with max depth and max bin") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
|
||||
"eval_metric" -> "error", "num_round" -> 10, "num_workers" -> math.min(numWorkers, 2))
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
|
||||
"eval_metric" -> "error")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 10,
|
||||
nWorkers = math.min(numWorkers, 2))
|
||||
val x = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix)
|
||||
assert(x < 0.1)
|
||||
}
|
||||
|
||||
test("dense vectors containing missing value") {
|
||||
def buildDenseDataFrame(): DataFrame = {
|
||||
test("test with dense vectors containing missing value") {
|
||||
def buildDenseRDD(): RDD[MLLabeledPoint] = {
|
||||
val numRows = 100
|
||||
val numCols = 5
|
||||
|
||||
val data = (0 until numRows).map { x =>
|
||||
val label = Random.nextInt(2)
|
||||
val labeledPoints = (0 until numRows).map { _ =>
|
||||
val label = Random.nextDouble()
|
||||
val values = Array.tabulate[Double](numCols) { c =>
|
||||
if (c == numCols - 1) -0.1 else Random.nextDouble
|
||||
if (c == numCols - 1) -0.1 else Random.nextDouble()
|
||||
}
|
||||
|
||||
(label, Vectors.dense(values))
|
||||
MLLabeledPoint(label, Vectors.dense(values))
|
||||
}
|
||||
|
||||
ss.createDataFrame(sc.parallelize(data.toList)).toDF("label", "features")
|
||||
sc.parallelize(labeledPoints)
|
||||
}
|
||||
|
||||
val denseDF = buildDenseDataFrame().repartition(4)
|
||||
val trainingRDD = buildDenseRDD().repartition(4)
|
||||
val testRDD = buildDenseRDD().repartition(4).map(_.features.asInstanceOf[DenseVector])
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "missing" -> -0.1f, "num_workers" -> numWorkers).toMap
|
||||
val model = new XGBoostClassifier(paramMap).fit(denseDF)
|
||||
model.transform(denseDF).collect()
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers,
|
||||
useExternalMemory = true)
|
||||
xgBoostModel.predict(testRDD, missingValue = -0.1f).collect()
|
||||
}
|
||||
|
||||
test("test consistency of prediction functions with RDD") {
|
||||
import DataUtils._
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSet = Classification.test
|
||||
val testRDD = sc.parallelize(testSet, numSlices = 1).map(_.features)
|
||||
val testCollection = testRDD.collect()
|
||||
for (i <- testSet.indices) {
|
||||
assert(testCollection(i).toDense.values.sameElements(testSet(i).features.toDense.values))
|
||||
}
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
val predRDD = xgBoostModel.predict(testRDD)
|
||||
val predResult1 = predRDD.collect()
|
||||
assert(testRDD.count() === predResult1.length)
|
||||
val predResult2 = xgBoostModel.booster.predict(new DMatrix(testSet.iterator))
|
||||
for (i <- predResult1.indices; j <- predResult1(i).indices) {
|
||||
assert(predResult1(i)(j) === predResult2(i)(j))
|
||||
}
|
||||
}
|
||||
|
||||
test("test eval functions with RDD") {
|
||||
import DataUtils._
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML).cache()
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5, nWorkers = numWorkers)
|
||||
// Nan Zhu: deprecate it for now
|
||||
// xgBoostModel.eval(trainingRDD, "eval1", iter = 5, useExternalCache = false)
|
||||
xgBoostModel.eval(trainingRDD, "eval2", evalFunc = new EvalError, useExternalCache = false)
|
||||
}
|
||||
|
||||
test("test prediction functionality with empty partition") {
|
||||
import DataUtils._
|
||||
def buildEmptyRDD(sparkContext: Option[SparkContext] = None): RDD[SparkVector] = {
|
||||
sparkContext.getOrElse(sc).parallelize(List[SparkVector](), numWorkers)
|
||||
}
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testRDD = buildEmptyRDD()
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
println(xgBoostModel.predict(testRDD).collect().length === 0)
|
||||
}
|
||||
|
||||
test("test use groupData") {
|
||||
import DataUtils._
|
||||
val trainingRDD = sc.parallelize(Ranking.train0, numSlices = 1).map(_.asML)
|
||||
val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0)
|
||||
val testRDD = sc.parallelize(Ranking.test, numSlices = 1).map(_.features)
|
||||
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "rank:pairwise", "eval_metric" -> "ndcg", "groupData" -> trainGroupData)
|
||||
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 2, nWorkers = 1)
|
||||
val predRDD = xgBoostModel.predict(testRDD)
|
||||
val predResult1: Array[Array[Float]] = predRDD.collect()
|
||||
assert(testRDD.count() === predResult1.length)
|
||||
|
||||
val avgMetric = xgBoostModel.eval(trainingRDD, "test", iter = 0, groupData = trainGroupData)
|
||||
assert(avgMetric contains "ndcg")
|
||||
// If the labels were lost ndcg comes back as 1.0
|
||||
assert(avgMetric.split('=')(1).toFloat < 1F)
|
||||
}
|
||||
|
||||
test("test use nested groupData") {
|
||||
import DataUtils._
|
||||
val trainingRDD0 = sc.parallelize(Ranking.train0, numSlices = 1)
|
||||
val trainingRDD1 = sc.parallelize(Ranking.train1, numSlices = 1)
|
||||
val trainingRDD = trainingRDD0.union(trainingRDD1).map(_.asML)
|
||||
|
||||
val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0, Ranking.trainGroup1)
|
||||
|
||||
val testRDD = sc.parallelize(Ranking.test, numSlices = 1).map(_.features)
|
||||
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "rank:pairwise", "groupData" -> trainGroupData)
|
||||
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, nWorkers = 2)
|
||||
val predRDD = xgBoostModel.predict(testRDD)
|
||||
val predResult1: Array[Array[Float]] = predRDD.collect()
|
||||
assert(testRDD.count() === predResult1.length)
|
||||
}
|
||||
|
||||
test("training with spark parallelism checks disabled") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "timeout_request_workers" -> 0L,
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
assert(x < 0.1)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "timeout_request_workers" -> 0L).toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers)
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
}
|
||||
|
||||
test("isClassificationTask correctly classifies supported objectives") {
|
||||
import org.scalatest.prop.TableDrivenPropertyChecks._
|
||||
|
||||
val objectives = Table(
|
||||
("isClassificationTask", "params"),
|
||||
(true, Map("obj_type" -> "classification")),
|
||||
(false, Map("obj_type" -> "regression")),
|
||||
(false, Map("objective" -> "rank:ndcg")),
|
||||
(false, Map("objective" -> "rank:pairwise")),
|
||||
(false, Map("objective" -> "rank:map")),
|
||||
(false, Map("objective" -> "count:poisson")),
|
||||
(true, Map("objective" -> "binary:logistic")),
|
||||
(true, Map("objective" -> "binary:logitraw")),
|
||||
(true, Map("objective" -> "multi:softmax")),
|
||||
(true, Map("objective" -> "multi:softprob")),
|
||||
(false, Map("objective" -> "reg:linear")),
|
||||
(false, Map("objective" -> "reg:logistic")),
|
||||
(false, Map("objective" -> "reg:gamma")),
|
||||
(false, Map("objective" -> "reg:tweedie")))
|
||||
forAll (objectives) { (isClassificationTask: Boolean, params: Map[String, String]) =>
|
||||
assert(XGBoost.isClassificationTask(params) == isClassificationTask)
|
||||
}
|
||||
}
|
||||
|
||||
test("training with checkpoint boosters") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
|
||||
val tmpPath = Files.createTempDirectory("model1").toAbsolutePath.toString
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> 2, "silent" -> "1",
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> 2, "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "checkpoint_path" -> tmpPath,
|
||||
"checkpoint_interval" -> 2, "num_workers" -> numWorkers)
|
||||
|
||||
val prevModel = new XGBoostClassifier(paramMap ++ Seq("num_round" -> 5)).fit(training)
|
||||
def error(model: Booster): Float = eval.eval(
|
||||
model.predict(testDM, outPutMargin = true), testDM)
|
||||
"checkpoint_interval" -> 2).toMap
|
||||
val prevModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers)
|
||||
def error(model: XGBoostModel): Float = eval.eval(
|
||||
model.booster.predict(testSetDMatrix, outPutMargin = true), testSetDMatrix)
|
||||
|
||||
// Check only one model is kept after training
|
||||
val files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
|
||||
assert(files.length == 1)
|
||||
assert(files.head.getPath.getName == "8.model")
|
||||
val tmpModel = SXGBoost.loadModel(s"$tmpPath/8.model")
|
||||
val tmpModel = XGBoost.loadModelFromHadoopFile(s"$tmpPath/8.model")
|
||||
|
||||
// Train next model based on prev model
|
||||
val nextModel = new XGBoostClassifier(paramMap ++ Seq("num_round" -> 8)).fit(training)
|
||||
assert(error(tmpModel) > error(prevModel._booster))
|
||||
assert(error(prevModel._booster) > error(nextModel._booster))
|
||||
assert(error(nextModel._booster) < 0.1)
|
||||
val nextModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 8,
|
||||
nWorkers = numWorkers)
|
||||
assert(error(tmpModel) > error(prevModel))
|
||||
assert(error(prevModel) > error(nextModel))
|
||||
assert(error(nextModel) < 0.1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,133 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.nio.file.Files
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.DMatrix
|
||||
import org.apache.spark.ml.linalg.Vector
|
||||
import org.apache.spark.ml.param.ParamMap
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
class XGBoostModelSuite extends FunSuite with PerTest {
|
||||
test("test model consistency after save and load") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val tempDir = Files.createTempDirectory("xgboosttest-")
|
||||
val tempFile = Files.createTempFile(tempDir, "", "")
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
val evalResults = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix)
|
||||
assert(evalResults < 0.1)
|
||||
xgBoostModel.saveModelAsHadoopFile(tempFile.toFile.getAbsolutePath)
|
||||
val loadedXGBooostModel = XGBoost.loadModelFromHadoopFile(tempFile.toFile.getAbsolutePath)
|
||||
val predicts = loadedXGBooostModel.booster.predict(testSetDMatrix, outPutMargin = true)
|
||||
val loadedEvalResults = eval.eval(predicts, testSetDMatrix)
|
||||
assert(loadedEvalResults == evalResults)
|
||||
}
|
||||
|
||||
test("test save and load of different types of models") {
|
||||
import DataUtils._
|
||||
val tempDir = Files.createTempDirectory("xgboosttest-")
|
||||
val tempFile = Files.createTempFile(tempDir, "", "")
|
||||
var trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
var paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear")
|
||||
// validate regression model
|
||||
var xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers, useExternalMemory = false)
|
||||
xgBoostModel.setFeaturesCol("feature_col")
|
||||
xgBoostModel.setLabelCol("label_col")
|
||||
xgBoostModel.setPredictionCol("prediction_col")
|
||||
xgBoostModel.saveModelAsHadoopFile(tempFile.toFile.getAbsolutePath)
|
||||
var loadedXGBoostModel = XGBoost.loadModelFromHadoopFile(tempFile.toFile.getAbsolutePath)
|
||||
assert(loadedXGBoostModel.isInstanceOf[XGBoostRegressionModel])
|
||||
assert(loadedXGBoostModel.getFeaturesCol == "feature_col")
|
||||
assert(loadedXGBoostModel.getLabelCol == "label_col")
|
||||
assert(loadedXGBoostModel.getPredictionCol == "prediction_col")
|
||||
// classification model
|
||||
paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers, useExternalMemory = false)
|
||||
xgBoostModel.asInstanceOf[XGBoostClassificationModel].setRawPredictionCol("raw_col")
|
||||
xgBoostModel.asInstanceOf[XGBoostClassificationModel].setThresholds(Array(0.5, 0.5))
|
||||
xgBoostModel.saveModelAsHadoopFile(tempFile.toFile.getAbsolutePath)
|
||||
loadedXGBoostModel = XGBoost.loadModelFromHadoopFile(tempFile.toFile.getAbsolutePath)
|
||||
assert(loadedXGBoostModel.isInstanceOf[XGBoostClassificationModel])
|
||||
assert(loadedXGBoostModel.asInstanceOf[XGBoostClassificationModel].getRawPredictionCol ==
|
||||
"raw_col")
|
||||
assert(loadedXGBoostModel.asInstanceOf[XGBoostClassificationModel].getThresholds.deep ==
|
||||
Array(0.5, 0.5).deep)
|
||||
assert(loadedXGBoostModel.getFeaturesCol == "features")
|
||||
assert(loadedXGBoostModel.getLabelCol == "label")
|
||||
assert(loadedXGBoostModel.getPredictionCol == "prediction")
|
||||
// (multiclass) classification model
|
||||
trainingRDD = sc.parallelize(MultiClassification.train).map(_.asML)
|
||||
paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "multi:softmax", "num_class" -> "6")
|
||||
xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers, useExternalMemory = false)
|
||||
xgBoostModel.asInstanceOf[XGBoostClassificationModel].setRawPredictionCol("raw_col")
|
||||
xgBoostModel.asInstanceOf[XGBoostClassificationModel].setThresholds(
|
||||
Array(0.5, 0.5, 0.5, 0.5, 0.5, 0.5))
|
||||
xgBoostModel.saveModelAsHadoopFile(tempFile.toFile.getAbsolutePath)
|
||||
loadedXGBoostModel = XGBoost.loadModelFromHadoopFile(tempFile.toFile.getAbsolutePath)
|
||||
assert(loadedXGBoostModel.isInstanceOf[XGBoostClassificationModel])
|
||||
assert(loadedXGBoostModel.asInstanceOf[XGBoostClassificationModel].getRawPredictionCol ==
|
||||
"raw_col")
|
||||
assert(loadedXGBoostModel.asInstanceOf[XGBoostClassificationModel].getThresholds.deep ==
|
||||
Array(0.5, 0.5, 0.5, 0.5, 0.5, 0.5).deep)
|
||||
assert(loadedXGBoostModel.asInstanceOf[XGBoostClassificationModel].numOfClasses == 6)
|
||||
assert(loadedXGBoostModel.getFeaturesCol == "features")
|
||||
assert(loadedXGBoostModel.getLabelCol == "label")
|
||||
assert(loadedXGBoostModel.getPredictionCol == "prediction")
|
||||
}
|
||||
|
||||
test("copy and predict ClassificationModel") {
|
||||
import DataUtils._
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testRDD = sc.parallelize(Classification.test).map(_.features)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val model = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
testCopy(model, testRDD)
|
||||
}
|
||||
|
||||
test("copy and predict RegressionModel") {
|
||||
import DataUtils._
|
||||
val trainingRDD = sc.parallelize(Regression.train).map(_.asML)
|
||||
val testRDD = sc.parallelize(Regression.test).map(_.features)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "reg:linear")
|
||||
val model = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
testCopy(model, testRDD)
|
||||
}
|
||||
|
||||
private def testCopy(model: XGBoostModel, testRDD: RDD[Vector]): Unit = {
|
||||
val modelCopy = model.copy(ParamMap.empty)
|
||||
modelCopy.summary // Ensure no exception.
|
||||
|
||||
val expected = model.predict(testRDD).collect
|
||||
assert(modelCopy.predict(testRDD).collect === expected)
|
||||
}
|
||||
}
|
||||
@@ -1,191 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import org.apache.spark.ml.linalg.Vector
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.types._
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
|
||||
test("XGBoost-Spark XGBoostRegressor ouput should match XGBoost4j: regression") {
|
||||
val trainingDM = new DMatrix(Regression.train.iterator)
|
||||
val testDM = new DMatrix(Regression.test.iterator)
|
||||
val trainingDF = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val round = 5
|
||||
|
||||
val paramMap = Map(
|
||||
"eta" -> "1",
|
||||
"max_depth" -> "6",
|
||||
"silent" -> "1",
|
||||
"objective" -> "reg:linear")
|
||||
|
||||
val model1 = ScalaXGBoost.train(trainingDM, paramMap, round)
|
||||
val prediction1 = model1.predict(testDM)
|
||||
|
||||
val model2 = new XGBoostRegressor(paramMap ++ Array("num_round" -> round,
|
||||
"num_workers" -> numWorkers)).fit(trainingDF)
|
||||
|
||||
val prediction2 = model2.transform(testDF).
|
||||
collect().map(row => (row.getAs[Int]("id"), row.getAs[Double]("prediction"))).toMap
|
||||
|
||||
assert(prediction1.indices.count { i =>
|
||||
math.abs(prediction1(i)(0) - prediction2(i)) > 0.01
|
||||
} < prediction1.length * 0.1)
|
||||
|
||||
|
||||
// check the equality of single instance prediction
|
||||
val firstOfDM = testDM.slice(Array(0))
|
||||
val firstOfDF = testDF.head().getAs[Vector]("features")
|
||||
val prediction3 = model1.predict(firstOfDM)(0)(0)
|
||||
val prediction4 = model2.predict(firstOfDF)
|
||||
assert(math.abs(prediction3 - prediction4) <= 0.01f)
|
||||
}
|
||||
|
||||
test("Set params in XGBoost and MLlib way should produce same model") {
|
||||
val trainingDF = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val round = 5
|
||||
|
||||
val paramMap = Map(
|
||||
"eta" -> "1",
|
||||
"max_depth" -> "6",
|
||||
"silent" -> "1",
|
||||
"objective" -> "reg:linear",
|
||||
"num_round" -> round,
|
||||
"num_workers" -> numWorkers)
|
||||
|
||||
// Set params in XGBoost way
|
||||
val model1 = new XGBoostRegressor(paramMap).fit(trainingDF)
|
||||
// Set params in MLlib way
|
||||
val model2 = new XGBoostRegressor()
|
||||
.setEta(1)
|
||||
.setMaxDepth(6)
|
||||
.setSilent(1)
|
||||
.setObjective("reg:linear")
|
||||
.setNumRound(round)
|
||||
.setNumWorkers(numWorkers)
|
||||
.fit(trainingDF)
|
||||
|
||||
val prediction1 = model1.transform(testDF).select("prediction").collect()
|
||||
val prediction2 = model2.transform(testDF).select("prediction").collect()
|
||||
|
||||
prediction1.zip(prediction2).foreach { case (Row(p1: Double), Row(p2: Double)) =>
|
||||
assert(math.abs(p1 - p2) <= 0.01f)
|
||||
}
|
||||
}
|
||||
|
||||
test("ranking: use group data") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "rank:pairwise", "num_workers" -> numWorkers, "num_round" -> 5,
|
||||
"group_col" -> "group")
|
||||
|
||||
val trainingDF = buildDataFrameWithGroup(Ranking.train)
|
||||
val testDF = buildDataFrame(Ranking.test)
|
||||
val model = new XGBoostRegressor(paramMap).fit(trainingDF)
|
||||
|
||||
val prediction = model.transform(testDF).collect()
|
||||
assert(testDF.count() === prediction.length)
|
||||
}
|
||||
|
||||
test("use weight") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
|
||||
val getWeightFromId = udf({id: Int => if (id == 0) 1.0f else 0.001f}, DataTypes.FloatType)
|
||||
val trainingDF = buildDataFrame(Regression.train)
|
||||
.withColumn("weight", getWeightFromId(col("id")))
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
|
||||
val model = new XGBoostRegressor(paramMap).setWeightCol("weight").fit(trainingDF)
|
||||
val prediction = model.transform(testDF).collect()
|
||||
val first = prediction.head.getAs[Double]("prediction")
|
||||
prediction.foreach(x => assert(math.abs(x.getAs[Double]("prediction") - first) <= 0.01f))
|
||||
}
|
||||
|
||||
test("test predictionLeaf") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val groundTruth = testDF.count()
|
||||
val xgb = new XGBoostRegressor(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
model.setLeafPredictionCol("predictLeaf")
|
||||
val resultDF = model.transform(testDF)
|
||||
assert(resultDF.count === groundTruth)
|
||||
assert(resultDF.columns.contains("predictLeaf"))
|
||||
}
|
||||
|
||||
test("test predictionLeaf with empty column name") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val xgb = new XGBoostRegressor(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
model.setLeafPredictionCol("")
|
||||
val resultDF = model.transform(testDF)
|
||||
assert(!resultDF.columns.contains("predictLeaf"))
|
||||
}
|
||||
|
||||
test("test predictionContrib") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val groundTruth = testDF.count()
|
||||
val xgb = new XGBoostRegressor(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
model.setContribPredictionCol("predictContrib")
|
||||
val resultDF = model.transform(testDF)
|
||||
assert(resultDF.count === groundTruth)
|
||||
assert(resultDF.columns.contains("predictContrib"))
|
||||
}
|
||||
|
||||
test("test predictionContrib with empty column name") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val xgb = new XGBoostRegressor(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
model.setContribPredictionCol("")
|
||||
val resultDF = model.transform(testDF)
|
||||
assert(!resultDF.columns.contains("predictContrib"))
|
||||
}
|
||||
|
||||
test("test predictionLeaf and predictionContrib") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val training = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val groundTruth = testDF.count()
|
||||
val xgb = new XGBoostRegressor(paramMap)
|
||||
val model = xgb.fit(training)
|
||||
model.setLeafPredictionCol("predictLeaf")
|
||||
model.setContribPredictionCol("predictContrib")
|
||||
val resultDF = model.transform(testDF)
|
||||
assert(resultDF.count === groundTruth)
|
||||
assert(resultDF.columns.contains("predictLeaf"))
|
||||
assert(resultDF.columns.contains("predictContrib"))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.io.{File, FileNotFoundException}
|
||||
|
||||
import scala.util.Random
|
||||
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.ml.feature._
|
||||
import org.apache.spark.ml.{Pipeline, PipelineModel}
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.scalatest.{BeforeAndAfterAll, FunSuite}
|
||||
|
||||
class XGBoostSparkPipelinePersistence extends FunSuite with PerTest
|
||||
with BeforeAndAfterAll {
|
||||
|
||||
override def afterAll(): Unit = {
|
||||
delete(new File("./testxgbPipe"))
|
||||
delete(new File("./testxgbEst"))
|
||||
delete(new File("./testxgbModel"))
|
||||
delete(new File("./test2xgbModel"))
|
||||
}
|
||||
|
||||
private def delete(f: File) {
|
||||
if (f.exists()) {
|
||||
if (f.isDirectory()) {
|
||||
for (c <- f.listFiles()) {
|
||||
delete(c)
|
||||
}
|
||||
}
|
||||
if (!f.delete()) {
|
||||
throw new FileNotFoundException("Failed to delete file: " + f)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("test persistence of XGBoostEstimator") {
|
||||
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "multi:softmax", "num_class" -> "6")
|
||||
val xgbEstimator = new XGBoostEstimator(paramMap)
|
||||
xgbEstimator.write.overwrite().save("./testxgbEst")
|
||||
val loadedxgbEstimator = XGBoostEstimator.read.load("./testxgbEst")
|
||||
val loadedParamMap = loadedxgbEstimator.fromParamsToXGBParamMap
|
||||
paramMap.foreach {
|
||||
case (k, v) => assert(v == loadedParamMap(k).toString)
|
||||
}
|
||||
}
|
||||
|
||||
test("test persistence of a complete pipeline") {
|
||||
val conf = new SparkConf().setAppName("foo").setMaster("local[*]")
|
||||
val spark = SparkSession.builder().config(conf).getOrCreate()
|
||||
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "multi:softmax", "num_class" -> "6")
|
||||
val r = new Random(0)
|
||||
val assembler = new VectorAssembler().setInputCols(Array("feature")).setOutputCol("features")
|
||||
val xgbEstimator = new XGBoostEstimator(paramMap)
|
||||
val pipeline = new Pipeline().setStages(Array(assembler, xgbEstimator))
|
||||
pipeline.write.overwrite().save("testxgbPipe")
|
||||
val loadedPipeline = Pipeline.read.load("testxgbPipe")
|
||||
val loadedEstimator = loadedPipeline.getStages(1).asInstanceOf[XGBoostEstimator]
|
||||
val loadedParamMap = loadedEstimator.fromParamsToXGBParamMap
|
||||
paramMap.foreach {
|
||||
case (k, v) => assert(v == loadedParamMap(k).toString)
|
||||
}
|
||||
}
|
||||
|
||||
test("test persistence of XGBoostModel") {
|
||||
val conf = new SparkConf().setAppName("foo").setMaster("local[*]")
|
||||
val spark = SparkSession.builder().config(conf).getOrCreate()
|
||||
val r = new Random(0)
|
||||
// maybe move to shared context, but requires session to import implicits
|
||||
val df = spark.createDataFrame(Seq.fill(10000)(r.nextInt(2)).map(i => (i, i))).
|
||||
toDF("feature", "label")
|
||||
val vectorAssembler = new VectorAssembler()
|
||||
.setInputCols(df.columns
|
||||
.filter(!_.contains("label")))
|
||||
.setOutputCol("features")
|
||||
val xgbEstimator = new XGBoostEstimator(Map("num_round" -> 10,
|
||||
"tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala")
|
||||
)).setFeaturesCol("features").setLabelCol("label")
|
||||
// separate
|
||||
val predModel = xgbEstimator.fit(vectorAssembler.transform(df))
|
||||
predModel.write.overwrite.save("test2xgbModel")
|
||||
val same2Model = XGBoostModel.load("test2xgbModel")
|
||||
|
||||
assert(java.util.Arrays.equals(predModel.booster.toByteArray, same2Model.booster.toByteArray))
|
||||
val predParamMap = predModel.extractParamMap()
|
||||
val same2ParamMap = same2Model.extractParamMap()
|
||||
assert(predParamMap.get(predModel.useExternalMemory)
|
||||
=== same2ParamMap.get(same2Model.useExternalMemory))
|
||||
assert(predParamMap.get(predModel.featuresCol) === same2ParamMap.get(same2Model.featuresCol))
|
||||
assert(predParamMap.get(predModel.predictionCol)
|
||||
=== same2ParamMap.get(same2Model.predictionCol))
|
||||
assert(predParamMap.get(predModel.labelCol) === same2ParamMap.get(same2Model.labelCol))
|
||||
assert(predParamMap.get(predModel.labelCol) === same2ParamMap.get(same2Model.labelCol))
|
||||
|
||||
// chained
|
||||
val predictionModel = new Pipeline().setStages(Array(vectorAssembler, xgbEstimator)).fit(df)
|
||||
predictionModel.write.overwrite.save("testxgbModel")
|
||||
val sameModel = PipelineModel.load("testxgbModel")
|
||||
|
||||
val predictionModelXGB = predictionModel.stages.collect { case xgb: XGBoostModel => xgb } head
|
||||
val sameModelXGB = sameModel.stages.collect { case xgb: XGBoostModel => xgb } head
|
||||
|
||||
assert(java.util.Arrays.equals(
|
||||
predictionModelXGB.booster.toByteArray,
|
||||
sameModelXGB.booster.toByteArray
|
||||
))
|
||||
val predictionModelXGBParamMap = predictionModel.extractParamMap()
|
||||
val sameModelXGBParamMap = sameModel.extractParamMap()
|
||||
assert(predictionModelXGBParamMap.get(predictionModelXGB.useExternalMemory)
|
||||
=== sameModelXGBParamMap.get(sameModelXGB.useExternalMemory))
|
||||
assert(predictionModelXGBParamMap.get(predictionModelXGB.featuresCol)
|
||||
=== sameModelXGBParamMap.get(sameModelXGB.featuresCol))
|
||||
assert(predictionModelXGBParamMap.get(predictionModelXGB.predictionCol)
|
||||
=== sameModelXGBParamMap.get(sameModelXGB.predictionCol))
|
||||
assert(predictionModelXGBParamMap.get(predictionModelXGB.labelCol)
|
||||
=== sameModelXGBParamMap.get(sameModelXGB.labelCol))
|
||||
assert(predictionModelXGBParamMap.get(predictionModelXGB.labelCol)
|
||||
=== sameModelXGBParamMap.get(sameModelXGB.labelCol))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,24 +16,22 @@
|
||||
|
||||
package org.apache.spark
|
||||
|
||||
import org.scalatest.FunSuite
|
||||
import _root_.ml.dmlc.xgboost4j.scala.spark.PerTest
|
||||
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.scalatest.{BeforeAndAfterAll, FunSuite}
|
||||
|
||||
class SparkParallelismTrackerSuite extends FunSuite with PerTest {
|
||||
class SparkParallelismTrackerSuite extends FunSuite with BeforeAndAfterAll {
|
||||
var sc: SparkContext = _
|
||||
var numParallelism: Int = _
|
||||
|
||||
val numParallelism: Int = Runtime.getRuntime.availableProcessors()
|
||||
override def beforeAll(): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
.setMaster("local[*]")
|
||||
.setAppName("XGBoostSuite")
|
||||
sc = new SparkContext(conf)
|
||||
numParallelism = sc.defaultParallelism
|
||||
}
|
||||
|
||||
override protected def sparkSessionBuilder: SparkSession.Builder = SparkSession.builder()
|
||||
.master("local[*]")
|
||||
.appName("XGBoostSuite")
|
||||
.config("spark.ui.enabled", true)
|
||||
.config("spark.driver.memory", "512m")
|
||||
.config("spark.task.cpus", 1)
|
||||
|
||||
test("tracker should not affect execution result when timeout is not larger than 0") {
|
||||
test("tracker should not affect execution result") {
|
||||
val nWorkers = numParallelism
|
||||
val rdd: RDD[Int] = sc.parallelize(1 to nWorkers)
|
||||
val tracker = new SparkParallelismTracker(sc, 10000, nWorkers)
|
||||
@@ -56,21 +54,4 @@ class SparkParallelismTrackerSuite extends FunSuite with PerTest {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("tracker should throw exception if parallelism is not sufficient with" +
|
||||
" spark.task.cpus larger than 1") {
|
||||
sc.conf.set("spark.task.cpus", "2")
|
||||
val nWorkers = numParallelism
|
||||
val rdd: RDD[Int] = sc.parallelize(1 to nWorkers)
|
||||
val tracker = new SparkParallelismTracker(sc, 1000, nWorkers)
|
||||
intercept[IllegalStateException] {
|
||||
tracker.execute {
|
||||
rdd.map { i =>
|
||||
// Test interruption
|
||||
Thread.sleep(Long.MaxValue)
|
||||
i
|
||||
}.sum()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,10 +6,10 @@
|
||||
<parent>
|
||||
<groupId>ml.dmlc</groupId>
|
||||
<artifactId>xgboost-jvm</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
</parent>
|
||||
<artifactId>xgboost4j</artifactId>
|
||||
<version>0.80</version>
|
||||
<version>0.72</version>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user