Compare commits
master-roc ... master
305 commits
| Author | SHA1 | Date |
|---|---|---|
|  | 9ecb7583e9 |  |
|  | 92f1c48a22 |  |
|  | 8cf2f7aed8 |  |
|  | 429f956111 |  |
|  | c9f89c4241 |  |
|  | 9ee4008654 |  |
|  | 497f1bdd38 |  |
|  | 521324ba9c |  |
|  | 271f4a80e7 |  |
|  | 13b9874fd6 |  |
|  | dac6e4daa1 |  |
|  | 1d6f9d91fc |  |
|  | 43ca23fdf2 |  |
|  | cc2daadec3 |  |
|  | 86157b9480 |  |
|  | 83b5eabd70 |  |
|  | df6b3e1481 |  |
|  | 72546e71a8 |  |
|  | c648442a46 |  |
|  | a049490cdb |  |
|  | 2179baa50c |  |
|  | bc69a3e877 |  |
|  | f3df0d0eb4 |  |
|  | 2a03685bff |  |
|  | 68a8865bc5 |  |
|  | 982ee34658 |  |
|  | e228c1a121 |  |
|  | 215da76263 |  |
|  | 19b55b300b |  |
|  | d7599e095b |  |
|  | 2a37a8880c |  |
|  | 24241ed6e3 |  |
|  | d5e1c41b69 |  |
|  | 15c6172e09 |  |
|  | 96bbf80457 |  |
|  | de00e07087 |  |
|  | 67c8c96784 |  |
|  | d94f6679fc |  |
|  | ed5f33df16 |  |
|  | 3ef8383d93 |  |
|  | bba6aa74fb |  |
|  | 5f7f31d464 |  |
|  | c69c4adb58 |  |
|  | f52f11e1d7 |  |
|  | ec8cfb3267 |  |
|  | 15b72571f3 |  |
|  | 4f88ada219 |  |
|  | 4503555274 |  |
|  | e1a2c1bbb3 |  |
|  | 98ac153265 |  |
|  | 5cc7c735e5 |  |
|  | 34d4ab455e |  |
|  | 61dd854a52 |  |
|  | 34937fea41 |  |
|  | 7510a87466 |  |
|  | 4fe67f10b4 |  |
|  | 3043827efc |  |
|  | 7794d3da8a |  |
|  | 64afe9873b |  |
|  | bde1265caf |  |
|  | d6ebcfb032 |  |
|  | 12c6b7ceea |  |
|  | 06c4246ff1 |  |
|  | 25966e4ba8 |  |
|  | 074cad2343 |  |
|  | 479ae8081b |  |
|  | fd0138c91c |  |
|  | 55aef8f546 |  |
|  | dbfafd8557 |  |
|  | cd83fe6033 |  |
|  | 142bdc73ec |  |
|  | cb54374550 |  |
|  | 03bd1183bc |  |
|  | 24d225c1ab |  |
|  | 9b88495840 |  |
|  | 402e7837fb |  |
|  | e9f1abc1f0 |  |
|  | adf87b27c5 |  |
|  | 508ac13243 |  |
|  | b949a4bf7b |  |
|  | 5db0803eb2 |  |
|  | caabee2135 |  |
|  | fd365c147e |  |
|  | ec3f327c20 |  |
|  | 8d7fe262d9 |  |
|  | 033a666900 |  |
|  | abe65e3769 |  |
|  | 2258bc870d |  |
|  | 582ea104b5 |  |
|  | 0def8e0bae |  |
|  | 773ded684b |  |
|  | b457d0d792 |  |
|  | 2ecc85ffad |  |
|  | 43704549a2 |  |
|  | d414fdf2e7 |  |
|  | 18b28d9315 |  |
|  | fb9201abae |  |
|  | e02b376bf7 |  |
|  | 7bccc1ea2c |  |
|  | ac8366654b |  |
|  | e555a238bc |  |
|  | cc3b56fc37 |  |
|  | 6ccf116601 |  |
|  | 35b1cdb365 |  |
|  | 3d8107adb8 |  |
|  | a269055b2b |  |
|  | a185b693dc |  |
|  | 2e7ba900ef |  |
|  | ad32b4e021 |  |
|  | 9e0a9a066b |  |
|  | 574c20dc1d |  |
|  | 77c844cef7 |  |
|  | 778751a1bb |  |
|  | fb77ed7603 |  |
|  | 827d0e8edb |  |
|  | 757aafc131 |  |
|  | d4b82f50ab |  |
|  | 449be7a402 |  |
|  | 7720272870 |  |
|  | 384983ed27 |  |
|  | ec82c75ee7 |  |
|  | d5834b68c3 |  |
|  | fcae6301ec |  |
|  | 411c8466bd |  |
|  | 7949a8d5f4 |  |
|  | b3ed81877a |  |
|  | 003b418312 |  |
|  | 485d90218c |  |
|  | a19bbc9be5 |  |
|  | b2cae34a8e |  |
|  | f6cae4da85 |  |
|  | 6d9fcb771e |  |
|  | cb62f9e73b |  |
|  | 0846ad860c |  |
|  | 344ddeb9ca |  |
|  | 326921dbe4 |  |
|  | 7ab93f3ce3 |  |
|  | 292bb677e5 |  |
|  | e9fbce9791 |  |
|  | 07732e02e5 |  |
|  | 919cfd9c8d |  |
|  | c41a657c4e |  |
|  | ee8bb60bf1 |  |
|  | a6a8a55ffa |  |
|  | 5a92ffe3ca |  |
|  | 370dce9d57 |  |
|  | fa8fea145a |  |
|  | bbd308595a |  |
|  | ab982e7873 |  |
|  | 17c64300e3 |  |
|  | b7511cbd6f |  |
|  | a81ccab7e5 |  |
|  | 5b68b68379 |  |
|  | 0f789e2b22 |  |
|  | 8b77964d03 |  |
|  | 7996914a2d |  |
|  | 5b7c68946d |  |
|  | 6fc1088592 |  |
|  | ce97de2a7c |  |
|  | 5fea9d24f2 |  |
|  | 6c403187ec |  |
|  | 1ca4bfd20e |  |
|  | 89da9f9741 |  |
|  | 5f910cd4ff |  |
|  | 34b154c284 |  |
|  | baba3e9eb0 |  |
|  | 8e2b874b4c |  |
|  | 3ec74a1ba9 |  |
|  | 8d0f2bfbaa |  |
|  | 2266db17d1 |  |
|  | 0a3941be6d |  |
|  | 00264eb72b |  |
|  | 513d7a7d84 |  |
|  | 620b2b155a |  |
|  | cd1d108c7d |  |
|  | 6243e7c43d |  |
|  | 628411a654 |  |
|  | 9cb4c938da |  |
|  | e537b0969f |  |
|  | d33043a348 |  |
|  | a39fef2c67 |  |
|  | 5f0c1e902b |  |
|  | 804cf85fe4 |  |
|  | 09d32f1f2b |  |
|  | e8a962575a |  |
|  | 824fba783e |  |
|  | 2d88d17008 |  |
|  | bed3695beb |  |
|  | 4b88dfff24 |  |
|  | 5efc979551 |  |
|  | 08658b124d |  |
|  | 4abf24aa4f |  |
|  | 4c1920a6a5 |  |
|  | d4dee25eb3 |  |
|  | 9a8bb7d186 |  |
|  | c519f5690e |  |
|  | 124bc57a6e |  |
|  | 61ac8eec8a |  |
|  | 26eb68859f |  |
|  | b38c7fe2ce |  |
|  | 2b400b18d5 |  |
|  | e5f1720656 |  |
|  | 63418d2f35 |  |
|  | 45150a844e |  |
|  | 8689f0b562 |  |
|  | b9e5229ff2 |  |
|  | b4cc350ec5 |  |
|  | a8ddbac163 |  |
|  | bc3747bdce |  |
|  | 320e7c2041 |  |
|  | 6c83c8c2ef |  |
|  | 49e25cfb36 |  |
|  | bbff74d2ff |  |
|  | 601f2067c7 |  |
|  | b36d023f9e |  |
|  | 1ace9c66ec |  |
|  | dc14f98f40 |  |
|  | 01ff2b2c29 |  |
|  | cf0c1d0888 |  |
|  | e0ebbc0746 |  |
|  | 0c44067736 |  |
|  | c9f5fcaf21 |  |
|  | f5815b6982 |  |
|  | 9f6608d6aa |  |
|  | bc7643d35e |  |
|  | 9b7633c01d |  |
|  | 43a57c4a85 |  |
|  | 979e392deb |  |
|  | 0808e50ae8 |  |
|  | c4ec64d409 |  |
|  | 4057f861c1 |  |
|  | eb6622ff7a |  |
|  | 4847f24840 |  |
|  | 492bb76f64 |  |
|  | 7157b9586b |  |
|  | 99a7f5b3ab |  |
|  | 7f3e92d71a |  |
|  | 1164dc07cd |  |
|  | 6cfc3e16fc |  |
|  | 8286a190b7 |  |
|  | 4f48647932 |  |
|  | 92cba25fe2 |  |
|  | d2d01d977a |  |
|  | c2e3d4f3cd |  |
|  | e6eefea5e2 |  |
|  | 0717e886e5 |  |
|  | 324f2d4e4a |  |
|  | 8998733ef4 |  |
|  | bc6c993aaa |  |
|  | 2de67f0050 |  |
|  | 7354955cbb |  |
|  | 7ae5c972f9 |  |
|  | e20ed8ab9c |  |
|  | 5627af6b21 |  |
|  | 949f062229 |  |
|  | b2008773bb |  |
|  | 0058301e6f |  |
|  | 9def441e9a |  |
|  | 5086decb0c |  |
|  | 95ba0998b3 |  |
|  | 089bee0a00 |  |
|  | 5a084fb9b3 |  |
|  | 291d417f57 |  |
|  | 15eb553c1f |  |
|  | 932d7201f9 |  |
|  | 966dc81788 |  |
|  | d5fcbee44b |  |
|  | b8a7773736 |  |
|  | e56ca69c31 |  |
|  | 1b25d23583 |  |
|  | 6a43a4b9d3 |  |
|  | c7e7ce7569 |  |
|  | 7a54ca41c9 |  |
|  | d66b5570f4 |  |
|  | 841867e05a |  |
|  | e7f8f40240 |  |
|  | d5c9ef64a5 |  |
|  | a5a58102e5 |  |
|  | ba9b4cb1ee |  |
|  | 835e59e538 |  |
|  | ee2afb3256 |  |
|  | ca1d04bcb7 |  |
|  | f1f69ff10e |  |
|  | 871fabeee3 |  |
|  | 75fe2ff0c3 |  |
|  | d81e319e78 |  |
|  | 5e816e616a |  |
|  | 5de57435c7 |  |
|  | f588252481 |  |
|  | 9b465052ce |  |
|  | 8237920c48 |  |
|  | 73afef1a6e |  |
|  | dcc9639b91 |  |
|  | 5e64276a9b |  |
|  | 837d44a345 |  |
|  | f8c3d22587 |  |
|  | 54754f29dd |  |
|  | f355418186 |  |
|  | 4d69ce96b3 |  |
|  | a5003fc8ce |  |
|  | 8ed85b8ce7 |  |
|  | edb945d59b |  |
|  | a81b78e56b |  |
|  | 58513dc288 |  |
|  | 59d7b8dc72 |  |

.github/dependabot.yml (8 changes, vendored)

```diff
@@ -12,7 +12,7 @@ updates:
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j"
     schedule:
-      interval: "daily"
+      interval: "monthly"
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j-gpu"
     schedule:
@@ -24,8 +24,12 @@ updates:
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j-spark"
     schedule:
-      interval: "daily"
+      interval: "monthly"
   - package-ecosystem: "maven"
     directory: "/jvm-packages/xgboost4j-spark-gpu"
     schedule:
       interval: "monthly"
+  - package-ecosystem: "github-actions"
+    directory: /
+    schedule:
+      interval: "monthly"
```

.github/workflows/freebsd.yml (new file, 34 lines, vendored)

```diff
@@ -0,0 +1,34 @@
+name: FreeBSD
+
+on: [push, pull_request]
+
+permissions:
+  contents: read  # to fetch code (actions/checkout)
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    name: A job to run test in FreeBSD
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: 'true'
+      - name: Test in FreeBSD
+        id: test
+        uses: vmactions/freebsd-vm@v1
+        with:
+          usesh: true
+          prepare: |
+            pkg install -y cmake git ninja googletest
+
+          run: |
+            mkdir build
+            cd build
+            cmake .. -GNinja -DGOOGLE_TEST=ON
+            ninja -v
+            ./testxgboost
```

.github/workflows/i386.yml (6 changes, vendored)

```diff
@@ -19,15 +19,15 @@ jobs:
       ports:
         - 5000:5000
     steps:
-      - uses: actions/checkout@v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v3.6.1
         with:
           driver-opts: network=host
       - name: Build and push container
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
         with:
           context: .
           file: tests/ci_build/Dockerfile.i386
```

.github/workflows/jvm_tests.yml (59 changes, vendored)

```diff
@@ -12,48 +12,50 @@ concurrency:
 jobs:
   test-with-jvm:
     name: Test JVM on OS ${{ matrix.os }}
     timeout-minutes: 30
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-latest, macos-11]
+        os: [windows-latest, ubuntu-latest, macos-13]
 
     steps:
-      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
 
-      - uses: mamba-org/setup-micromamba@422500192359a097648154e8db4e39bdb6c6eed7 # v1.8.1
+      - uses: actions/setup-java@6a0805fcefea3d4657a47ac4c165951e33482018 # v4.2.2
         with:
-          micromamba-version: '1.5.6-0'
-          environment-name: jvm_tests
-          create-args: >-
-            python=3.10
-            awscli
-          cache-downloads: true
-          cache-environment: true
-          init-shell: bash powershell
+          distribution: 'temurin'
+          java-version: '8'
+
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
+        with:
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          activate-environment: jvm_tests
+          environment-file: tests/ci_build/conda_env/jvm_tests.yml
+          use-mamba: true
 
       - name: Cache Maven packages
-        uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0
+        uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
         with:
           path: ~/.m2
           key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
           restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
 
       - name: Build xgboost4j.dll
         run: |
           mkdir build
           cd build
           cmake .. -G"Visual Studio 17 2022" -A x64 -DJVM_BINDINGS=ON
           cmake --build . --config Release
         if: matrix.os == 'windows-latest'
 
       - name: Test XGBoost4J (Core)
         run: |
           cd jvm-packages
           mvn test -B -pl :xgboost4j_2.12
 
+      - name: Test XGBoost4J (Core, Spark, Examples)
+        run: |
+          rm -rfv build/
+          cd jvm-packages
+          mvn -B test
+        if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows
+
       - name: Extract branch name
         shell: bash
         run: |
@@ -61,7 +63,7 @@ jobs:
         id: extract_branch
         if: |
           (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
-          (matrix.os == 'windows-latest' || matrix.os == 'macos-11')
+          (matrix.os == 'windows-latest' || matrix.os == 'macos-13')
 
       - name: Publish artifact xgboost4j.dll to S3
         run: |
@@ -85,27 +87,14 @@ jobs:
           python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2
         if: |
           (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
-          matrix.os == 'macos-11'
+          matrix.os == 'macos-13'
         env:
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
 
-
-      - name: Test XGBoost4J (Core, Spark, Examples)
-        run: |
-          rm -rfv build/
-          cd jvm-packages
-          mvn -B test
-        if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows
-        env:
-          RABIT_MOCK: ON
-
 
       - name: Build and Test XGBoost4J with scala 2.13
         run: |
           rm -rfv build/
           cd jvm-packages
           mvn -B clean install test -Pdefault,scala-2.13
         if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows
         env:
           RABIT_MOCK: ON
```

.github/workflows/main.yml (46 changes, vendored)

```diff
@@ -21,9 +21,9 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [macos-11]
+        os: [macos-12]
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
       - name: Install system packages
@@ -33,7 +33,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON
+          cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo
           ninja -v
       - name: Run gtest binary
         run: |
@@ -49,7 +49,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
       - name: Install system packages
@@ -74,18 +74,18 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        python-version: ["3.8"]
+        python-version: ["3.10"]
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
-      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
         with:
-          cache-downloads: true
-          cache-env: true
-          environment-name: linux_sycl_test
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          activate-environment: linux_sycl_test
           environment-file: tests/ci_build/conda_env/linux_sycl_test.yml
+          use-mamba: true
       - name: Display Conda env
         run: |
           conda info
@@ -95,7 +95,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
+          cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
           make -j$(nproc)
       - name: Run gtest binary for SYCL
         run: |
@@ -116,17 +116,18 @@ jobs:
       fail-fast: false
       matrix:
         os: ["ubuntu-latest"]
-        python-version: ["3.8"]
+        python-version: ["3.10"]
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
-      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
         with:
-          cache-downloads: true
-          cache-env: true
-          environment-name: cpp_test
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          activate-environment: cpp_test
           environment-file: tests/ci_build/conda_env/cpp_test.yml
+          use-mamba: true
       - name: Display Conda env
         run: |
           conda info
@@ -155,8 +156,9 @@ jobs:
       - name: Build and install XGBoost shared library
         run: |
           cd build
-          cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja
+          cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON
           ninja -v install
+          ./testxgboost
           cd -
       - name: Build and run C API demo with shared
         run: |
@@ -175,12 +177,12 @@ jobs:
     runs-on: ubuntu-latest
     name: Code linting for C++
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
-      - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
+      - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
         with:
-          python-version: "3.8"
+          python-version: "3.10"
           architecture: 'x64'
       - name: Install Python packages
         run: |
```

.github/workflows/python_tests.yml (87 changes, vendored)

```diff
@@ -21,15 +21,16 @@ jobs:
       matrix:
         os: [ubuntu-latest]
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
-      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
         with:
-          cache-downloads: true
-          cache-env: true
-          environment-name: python_lint
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          activate-environment: python_lint
           environment-file: tests/ci_build/conda_env/python_lint.yml
+          use-mamba: true
       - name: Display Conda env
         run: |
           conda info
@@ -52,15 +53,16 @@ jobs:
       matrix:
         os: [ubuntu-latest]
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
-      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
         with:
-          cache-downloads: true
-          cache-env: true
-          environment-name: sdist_test
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          activate-environment: sdist_test
           environment-file: tests/ci_build/conda_env/sdist_test.yml
+          use-mamba: true
       - name: Display Conda env
         run: |
           conda info
@@ -81,17 +83,17 @@ jobs:
     name: Test installing XGBoost Python source package on ${{ matrix.os }}
     strategy:
       matrix:
-        os: [macos-11, windows-latest]
-        python-version: ["3.8"]
+        os: [macos-13, windows-latest]
+        python-version: ["3.10"]
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
       - name: Install osx system dependencies
-        if: matrix.os == 'macos-11'
+        if: matrix.os == 'macos-13'
         run: |
           brew install ninja libomp
-      - uses: conda-incubator/setup-miniconda@35d1405e78aa3f784fe3ce9a2eb378d5eeb62169 # v2.1.1
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
         with:
           auto-update-conda: true
           python-version: ${{ matrix.python-version }}
@@ -119,19 +121,20 @@ jobs:
     strategy:
       matrix:
         config:
-        - {os: macos-11}
+        - {os: macos-13}
 
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
 
-      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
         with:
-          cache-downloads: true
-          cache-env: true
-          environment-name: macos_test
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          activate-environment: macos_cpu_test
           environment-file: tests/ci_build/conda_env/macos_cpu_test.yml
+          use-mamba: true
 
       - name: Display Conda env
         run: |
@@ -171,14 +174,14 @@ jobs:
     strategy:
       matrix:
         config:
-        - {os: windows-latest, python-version: '3.8'}
+        - {os: windows-latest, python-version: '3.10'}
 
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
 
-      - uses: conda-incubator/setup-miniconda@35d1405e78aa3f784fe3ce9a2eb378d5eeb62169 # v2.1.1
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
         with:
           auto-update-conda: true
           python-version: ${{ matrix.config.python-version }}
@@ -215,19 +218,20 @@ jobs:
     strategy:
       matrix:
         config:
-        - {os: ubuntu-latest, python-version: "3.8"}
+        - {os: ubuntu-latest, python-version: "3.10"}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
 
-      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
         with:
-          cache-downloads: true
-          cache-env: true
-          environment-name: linux_cpu_test
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          activate-environment: linux_cpu_test
           environment-file: tests/ci_build/conda_env/linux_cpu_test.yml
+          use-mamba: true
 
       - name: Display Conda env
         run: |
@@ -267,19 +271,20 @@ jobs:
     strategy:
       matrix:
         config:
-        - {os: ubuntu-latest, python-version: "3.8"}
+        - {os: ubuntu-latest, python-version: "3.10"}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
 
-      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
         with:
-          cache-downloads: true
-          cache-env: true
-          environment-name: linux_sycl_test
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          activate-environment: linux_sycl_test
           environment-file: tests/ci_build/conda_env/linux_sycl_test.yml
+          use-mamba: true
 
       - name: Display Conda env
         run: |
@@ -289,7 +294,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DPLUGIN_SYCL=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
+          cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
           make -j$(nproc)
       - name: Install Python package
         run: |
@@ -309,14 +314,14 @@ jobs:
         os: [ubuntu-latest]
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
 
-      - name: Set up Python 3.8
-        uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
+      - name: Set up Python 3.10
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
         with:
-          python-version: 3.8
+          python-version: "3.10"
 
       - name: Install ninja
         run: |
```

.github/workflows/python_wheels.yml (28 changes, vendored)

```diff
@@ -5,6 +5,10 @@ on: [push, pull_request]
 permissions:
   contents: read # to fetch code (actions/checkout)
 
+defaults:
+  run:
+    shell: bash -l {0}
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
@@ -16,30 +20,36 @@ jobs:
     strategy:
       matrix:
         include:
-          - os: macos-latest
+          - os: macos-13
             platform_id: macosx_x86_64
-          - os: macos-latest
+          - os: macos-14
            platform_id: macosx_arm64
     steps:
-      - uses: actions/checkout@a12a3943b4bdde767164f792f33f40b04645d846 # v3.0.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
-      - name: Setup Python
-        uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
+      - name: Set up homebrew
+        uses: Homebrew/actions/setup-homebrew@68fa6aeb1ccb0596d311f2b34ec74ec21ee68e54
+      - name: Install libomp
+        run: brew install libomp
+      - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
         with:
-          python-version: "3.8"
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          python-version: "3.10"
+          use-mamba: true
       - name: Build wheels
         run: bash tests/ci_build/build_python_wheels.sh ${{ matrix.platform_id }} ${{ github.sha }}
       - name: Extract branch name
         shell: bash
-        run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+        run: |
+          echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT"
         id: extract_branch
         if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
       - name: Upload Python wheel
         if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
         run: |
           python -m pip install awscli
-          python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read
+          python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read --region us-west-2
         env:
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
```

.github/workflows/r_nold.yml (2 changes, vendored)

```diff
@@ -27,7 +27,7 @@ jobs:
         run: |
           apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y
 
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
```

.github/workflows/r_tests.yml (28 changes, vendored)

```diff
@@ -25,20 +25,20 @@ jobs:
       RSPM: ${{ matrix.config.rspm }}
 
     steps:
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
 
-      - uses: r-lib/actions/setup-r@e40ad904310fc92e96951c1b0d64f3de6cbe9e14 # v2.6.5
+      - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0
         with:
           r-version: ${{ matrix.config.r }}
 
       - name: Cache R packages
-        uses: actions/cache@937d24475381cd9c75ae6db12cb4e79714b926ed # v3.0.11
+        uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }}
-          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }}
+          key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
+          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
 
       - name: Install dependencies
         shell: Rscript {0}
@@ -69,24 +69,24 @@ jobs:
           sudo apt update
           sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev
         if: matrix.config.os == 'ubuntu-latest'
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
 
-      - uses: r-lib/actions/setup-r@e40ad904310fc92e96951c1b0d64f3de6cbe9e14 # v2.6.5
+      - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0
         with:
           r-version: ${{ matrix.config.r }}
 
       - name: Cache R packages
-        uses: actions/cache@937d24475381cd9c75ae6db12cb4e79714b926ed # v3.0.11
+        uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }}
-          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }}
+          key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
+          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
 
-      - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
+      - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
         with:
-          python-version: "3.8"
+          python-version: "3.10"
           architecture: 'x64'
 
       - uses: r-lib/actions/setup-tinytex@v2
@@ -123,7 +123,7 @@ jobs:
         run: |
           git config --global --add safe.directory "${GITHUB_WORKSPACE}"
 
-      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
 
@@ -137,7 +137,7 @@ jobs:
         run: |
           python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check
 
-      - uses: dorny/paths-filter@v2
+      - uses: dorny/paths-filter@v3
         id: changes
         with:
           filters: |
```

.github/workflows/scorecards.yml (6 changes, vendored)

```diff
@@ -22,12 +22,12 @@ jobs:
 
     steps:
       - name: "Checkout code"
-        uses: actions/checkout@a12a3943b4bdde767164f792f33f40b04645d846 # v3.0.0
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           persist-credentials: false
 
       - name: "Run analysis"
-        uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
+        uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0
         with:
           results_file: results.sarif
           results_format: sarif
@@ -41,7 +41,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
         with:
           name: SARIF file
           path: results.sarif
```

.github/workflows/update_rapids.yml (2 changes, vendored)

```diff
@@ -25,7 +25,7 @@ jobs:
     name: Check latest RAPIDS
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
         with:
           submodules: 'true'
       - name: Check latest RAPIDS and update conftest.sh
```

.gitignore (9 changes, vendored)

```diff
@@ -27,12 +27,13 @@
 *vali
 *sdf
 Release
-*exe*
+*exe
+*exp
 ipch
 *.filters
 *.user
 *log
 rmm_log.txt
 Debug
 *suo
 .Rhistory
@@ -63,6 +64,7 @@ java/xgboost4j-demo/data/
 java/xgboost4j-demo/tmp/
 java/xgboost4j-demo/model/
 nb-configuration*
+
 # Eclipse
 .project
 .cproject
@@ -84,6 +86,7 @@ target
 *.gcov
 *.gcda
 *.gcno
+*.ubj
 build_tests
 /tests/cpp/xgboost_test
 
@@ -97,6 +100,7 @@ metastore_db
 
 # files from R-package source install
 **/config.status
+R-package/config.h
 R-package/src/Makevars
 *.lib
 
@@ -152,3 +156,6 @@ model*.json
 *.rds
 Rplots.pdf
 *.zip
+
+# nsys
+*.nsys-rep
```

```diff
@@ -12,7 +12,7 @@ submodules:
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.8"
+    python: "3.10"
   apt_packages:
     - graphviz
     - cmake
```

```diff
@@ -1,12 +1,10 @@
 cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 
 if(PLUGIN_SYCL)
-  set(CMAKE_CXX_COMPILER "g++")
-  set(CMAKE_C_COMPILER "gcc")
   string(REPLACE " -isystem ${CONDA_PREFIX}/include" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 endif()
 
-project(xgboost LANGUAGES CXX C VERSION 2.1.0)
+project(xgboost LANGUAGES CXX C VERSION 2.2.0)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
 
@@ -69,12 +67,10 @@ option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule" OFF)
 option(USE_DEVICE_DEBUG "Generate CUDA device debug info." OFF)
 option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
-set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
-option(RABIT_MOCK "Build rabit with mock" OFF)
 option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF)
 option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binary dir" OFF)
 ## CUDA
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
 # This is specifically designed for PyPI binary release and should be disabled for most of the cases.
 option(USE_DLOPEN_NCCL "Whether to load nccl dynamically." OFF)
@@ -222,9 +218,26 @@ if(USE_CUDA)
   if(DEFINED GPU_COMPUTE_VER)
     compute_cmake_cuda_archs("${GPU_COMPUTE_VER}")
   endif()
   add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap)
 
   find_package(CUDAToolkit REQUIRED)
+  find_package(CCCL CONFIG)
+  if(NOT CCCL_FOUND)
+    message(STATUS "Standalone CCCL not found. Attempting to use CCCL from CUDA Toolkit...")
+    find_package(CCCL CONFIG
+      HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake)
+    if(NOT CCCL_FOUND)
+      message(STATUS "Could not locate CCCL from CUDA Toolkit. Using Thrust and CUB from CUDA Toolkit...")
+      find_package(libcudacxx CONFIG REQUIRED
+        HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake)
+      find_package(CUB CONFIG REQUIRED
+        HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake)
+      find_package(Thrust CONFIG REQUIRED
+        HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake)
+      thrust_create_target(Thrust HOST CPP DEVICE CUDA)
+      add_library(CCCL::CCCL INTERFACE IMPORTED GLOBAL)
+      target_link_libraries(CCCL::CCCL INTERFACE libcudacxx::libcudacxx CUB::CUB Thrust)
+    endif()
+  endif()
 endif()
 
 if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
@@ -235,28 +248,17 @@ endif()
 
 find_package(Threads REQUIRED)
 
+# -- OpenMP
+include(cmake/FindOpenMPMacOS.cmake)
 if(USE_OPENMP)
   if(APPLE)
-    find_package(OpenMP)
-    if(NOT OpenMP_FOUND)
-      # Try again with extra path info; required for libomp 15+ from Homebrew
-      execute_process(COMMAND brew --prefix libomp
-                      OUTPUT_VARIABLE HOMEBREW_LIBOMP_PREFIX
-                      OUTPUT_STRIP_TRAILING_WHITESPACE)
-      set(OpenMP_C_FLAGS
-        "-Xpreprocessor -fopenmp -I${HOMEBREW_LIBOMP_PREFIX}/include")
-      set(OpenMP_CXX_FLAGS
-        "-Xpreprocessor -fopenmp -I${HOMEBREW_LIBOMP_PREFIX}/include")
-      set(OpenMP_C_LIB_NAMES omp)
-      set(OpenMP_CXX_LIB_NAMES omp)
-      set(OpenMP_omp_LIBRARY ${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib)
-      find_package(OpenMP REQUIRED)
-    endif()
+    find_openmp_macos()
   else()
     find_package(OpenMP REQUIRED)
   endif()
 endif()
-#Add for IBM i
+
+# Add for IBM i
 if(${CMAKE_SYSTEM_NAME} MATCHES "OS400")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
   set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> -X64 qc <TARGET> <OBJECTS>")
@@ -266,11 +268,18 @@ if(USE_NCCL)
   find_package(Nccl REQUIRED)
 endif()
 
-# dmlc-core
-msvc_use_static_runtime()
-if(FORCE_SHARED_CRT)
-  set(DMLC_FORCE_SHARED_CRT ON)
+if(MSVC)
+  if(FORCE_SHARED_CRT)
+    message(STATUS "XGBoost: Using dynamically linked MSVC runtime...")
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL")
+  else()
+    message(STATUS "XGBoost: Using statically linked MSVC runtime...")
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+  endif()
 endif()
 
+# dmlc-core
+set(DMLC_FORCE_SHARED_CRT ${FORCE_SHARED_CRT})
 add_subdirectory(${xgboost_SOURCE_DIR}/dmlc-core)
 
 if(MSVC)
@@ -282,9 +291,6 @@ if(MSVC)
   endif()
 endif()
 
-# rabit
-add_subdirectory(rabit)
-
 # core xgboost
 add_subdirectory(${xgboost_SOURCE_DIR}/src)
 target_link_libraries(objxgboost PUBLIC dmlc)
@@ -350,7 +356,6 @@ if(BUILD_DEPRECATED_CLI)
     PRIVATE
       ${xgboost_SOURCE_DIR}/include
       ${xgboost_SOURCE_DIR}/dmlc-core/include
-      ${xgboost_SOURCE_DIR}/rabit/include
   )
   set_target_properties(runxgboost PROPERTIES OUTPUT_NAME xgboost)
   xgboost_target_properties(runxgboost)
@@ -378,6 +383,10 @@ if(JVM_BINDINGS)
   xgboost_target_defs(xgboost4j)
 endif()
 
+if(USE_OPENMP AND APPLE)
+  patch_openmp_path_macos(xgboost libxgboost)
+endif()
+
 if(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
   set_output_directory(xgboost ${xgboost_BINARY_DIR}/lib)
 else()
@@ -494,11 +503,6 @@ if(GOOGLE_TEST)
   endif()
 endif()
 
-# For MSVC: Call msvc_use_static_runtime() once again to completely
-# replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462
-# for issues caused by mixing of /MD and /MT flags
-msvc_use_static_runtime()
-
 # Add xgboost.pc
 if(ADD_PKGCONFIG)
   configure_file(${xgboost_SOURCE_DIR}/cmake/xgboost.pc.in ${xgboost_BINARY_DIR}/xgboost.pc @ONLY)
```

NEWS.md (2 changes)

```diff
@@ -1,6 +1,8 @@
 XGBoost Change Log
 ==================
 
+**Starting from 2.1.0, release note is recorded in the documentation.**
+
 This file records the changes in xgboost library in reverse chronological order.
 
 ## 2.0.0 (2023 Aug 16)
```

```diff
@@ -29,7 +29,6 @@ target_compile_definitions(
   -DDMLC_LOG_BEFORE_THROW=0
   -DDMLC_DISABLE_STDIN=1
   -DDMLC_LOG_CUSTOMIZE=1
-  -DRABIT_STRICT_CXX98_
 )
 
 target_include_directories(
@@ -37,7 +36,6 @@ target_include_directories(
   ${LIBR_INCLUDE_DIRS}
   ${PROJECT_SOURCE_DIR}/include
   ${PROJECT_SOURCE_DIR}/dmlc-core/include
-  ${PROJECT_SOURCE_DIR}/rabit/include
 )
 
 target_link_libraries(xgboost-r PUBLIC ${LIBR_CORE_LIBRARY})
```

```diff
@@ -1,8 +1,8 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 2.1.0.0
-Date: 2023-08-19
+Version: 2.2.0.0
+Date: 2024-06-03
 Authors@R: c(
     person("Tianqi", "Chen", role = c("aut"),
            email = "tianqi.tchen@gmail.com"),
@@ -57,7 +57,8 @@ Suggests:
     igraph (>= 1.0.1),
     float,
     titanic,
-    RhpcBLASctl
+    RhpcBLASctl,
+    survival
 Depends:
     R (>= 4.3.0)
 Imports:
@@ -66,6 +67,6 @@ Imports:
     data.table (>= 1.9.6),
     jsonlite (>= 1.0)
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Encoding: UTF-8
 SystemRequirements: GNU make, C++17
```

```diff
@@ -13,6 +13,7 @@ S3method(predict,xgb.Booster)
 S3method(print,xgb.Booster)
 S3method(print,xgb.DMatrix)
 S3method(print,xgb.cv.synchronous)
+S3method(print,xgboost)
 S3method(setinfo,xgb.Booster)
 S3method(setinfo,xgb.DMatrix)
 S3method(variable.names,xgb.Booster)
@@ -28,7 +29,7 @@ export(xgb.DMatrix.hasinfo)
 export(xgb.DMatrix.save)
 export(xgb.DataBatch)
 export(xgb.DataIter)
-export(xgb.ExternalDMatrix)
+export(xgb.ExtMemDMatrix)
 export(xgb.QuantileDMatrix)
 export(xgb.QuantileDMatrix.from_iterator)
 export(xgb.attr)
```
@ -1,172 +1,166 @@
|
||||
.reserved_cb_names <- c("names", "class", "call", "params", "niter", "nfeatures", "folds")
|
||||
|
||||
#' @title XGBoost Callback Constructor
|
||||
#' @description Constructor for defining the structure of callback functions that can be executed
|
||||
#' XGBoost Callback Constructor
|
||||
#'
|
||||
#' Constructor for defining the structure of callback functions that can be executed
|
||||
#' at different stages of model training (before / after training, before / after each boosting
|
||||
#' iteration).
|
||||
#' @param cb_name Name for the callback.
|
||||
#'
|
||||
#' If the callback produces some non-NULL result (from executing the function passed under
|
||||
#' `f_after_training`), that result will be added as an R attribute to the resulting booster
|
||||
#' (or as a named element in the result of CV), with the attribute name specified here.
|
||||
#' @details
|
||||
#' Arguments that will be passed to the supplied functions are as follows:
|
||||
#' - env The same environment that is passed under argument `env`.
|
||||
#'
|
||||
#' Names of callbacks must be unique - i.e. there cannot be two callbacks with the same name.
|
||||
#' @param env An environment object that will be passed to the different functions in the callback.
|
||||
#' Note that this environment will not be shared with other callbacks.
|
||||
#' @param f_before_training A function that will be executed before the training has started.
|
||||
#' It may be modified by the functions in order to e.g. keep tracking of what happens
|
||||
#' across iterations or similar.
|
||||
#'
|
||||
#' If passing `NULL` for this or for the other function inputs, then no function will be executed.
|
||||
#' This environment is only used by the functions supplied to the callback, and will
|
||||
#' not be kept after the model fitting function terminates (see parameter `f_after_training`).
|
||||
#'
|
||||
#' If passing a function, it will be called with parameters supplied as non-named arguments
|
||||
#' matching the function signatures that are shown in the default value for each function argument.
|
||||
#' @param f_before_iter A function that will be executed before each boosting round.
|
||||
#' - model The booster object when using [xgb.train()], or the folds when using [xgb.cv()].
|
||||
#'
|
||||
#' This function can signal whether the training should be finalized or not, by outputting
|
||||
#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at
|
||||
#' a given round is `TRUE`, then training will be stopped before the current iteration happens.
|
||||
#' For [xgb.cv()], folds are a list with a structure as follows:
|
||||
#' - `dtrain`: The training data for the fold (as an `xgb.DMatrix` object).
|
||||
#' - `bst`: Rhe `xgb.Booster` object for the fold.
|
||||
#' - `evals`: A list containing two DMatrices, with names `train` and `test`
|
||||
#' (`test` is the held-out data for the fold).
|
||||
#' - `index`: The indices of the hold-out data for that fold (base-1 indexing),
|
||||
#' from which the `test` entry in `evals` was obtained.
|
||||
#'
|
||||
#' Return values of `NULL` will be interpreted as `FALSE`.
|
||||
#' @param f_after_iter A function that will be executed after each boosting round.
|
||||
#' This object should **not** be in-place modified in ways that conflict with the
|
||||
#' training (e.g. resetting the parameters for a training update in a way that resets
|
||||
#' the number of rounds to zero in order to overwrite rounds).
|
||||
#'
|
||||
#' This function can signal whether the training should be finalized or not, by outputting
|
||||
#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at
|
||||
#' a given round is `TRUE`, then training will be stopped at that round.
|
||||
#' Note that any R attributes that are assigned to the booster during the callback functions,
|
||||
#' will not be kept thereafter as the booster object variable is not re-assigned during
|
||||
#' training. It is however possible to set C-level attributes of the booster through
|
||||
#' [xgb.attr()] or [xgb.attributes()], which should remain available for the rest
|
||||
#' of the iterations and after the training is done.
|
||||
#'
|
||||
#' Return values of `NULL` will be interpreted as `FALSE`.
|
||||
#' @param f_after_training A function that will be executed after training is finished.
|
||||
#' For keeping variables across iterations, it's recommended to use `env` instead.
|
||||
#' - data The data to which the model is being fit, as an `xgb.DMatrix` object.
|
||||
#'
|
||||
#' This function can optionally output something non-NULL, which will become part of the R
|
||||
#' attributes of the booster (assuming one passes `keep_extra_attributes=TRUE` to \link{xgb.train})
|
||||
#' under the name supplied for parameter `cb_name` imn the case of \link{xgb.train}; or a part
|
||||
#' of the named elements in the result of \link{xgb.cv}.
|
||||
#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
|
||||
#' @details Arguments that will be passed to the supplied functions are as follows:\itemize{
|
||||
#' Note that, for [xgb.cv()], this will be the full data, while data for the specific
|
||||
#' folds can be found in the `model` object.
|
||||
#' - evals The evaluation data, as passed under argument `evals` to [xgb.train()].
|
||||
#'
|
||||
#' \item env The same environment that is passed under argument `env`.
|
||||
#' For [xgb.cv()], this will always be `NULL`.
|
||||
#' - begin_iteration Index of the first boosting iteration that will be executed (base-1 indexing).
|
||||
#'
|
||||
#' It may be modified by the functions in order to e.g. keep tracking of what happens
|
||||
#' across iterations or similar.
|
||||
#' This will typically be '1', but when using training continuation, depending on the
|
||||
#' parameters for updates, boosting rounds will be continued from where the previous
|
||||
#' model ended, in which case this will be larger than 1.
|
||||
#'
|
||||
#' This environment is only used by the functions supplied to the callback, and will
|
||||
#' not be kept after the model fitting function terminates (see parameter `f_after_training`).
|
||||
#' - end_iteration Index of the last boostign iteration that will be executed
|
||||
#' (base-1 indexing, inclusive of this end).
|
||||
#'
|
||||
#' \item model The booster object when using \link{xgb.train}, or the folds when using
|
||||
#' \link{xgb.cv}.
|
||||
#' It should match with argument `nrounds` passed to [xgb.train()] or [xgb.cv()].
|
||||
#'
|
||||
#' For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{
|
||||
#' \item `dtrain`: The training data for the fold (as an `xgb.DMatrix` object).
|
||||
#' \item `bst`: Rhe `xgb.Booster` object for the fold.
|
||||
#' \item `evals`: A list containing two DMatrices, with names `train` and `test`
|
||||
#' (`test` is the held-out data for the fold).
|
||||
#' \item `index`: The indices of the hold-out data for that fold (base-1 indexing),
|
||||
#' from which the `test` entry in `evals` was obtained.
|
||||
#' }
|
||||
#' Note that boosting might be interrupted before reaching this last iteration, for
|
||||
#' example by using the early stopping callback [xgb.cb.early.stop()].
|
||||
#' - iteration Index of the iteration number that is being executed (first iteration
|
||||
#' will be the same as parameter `begin_iteration`, then next one will add +1, and so on).
|
||||
#'
|
||||
#' This object should \bold{not} be in-place modified in ways that conflict with the
|
||||
#' training (e.g. resetting the parameters for a training update in a way that resets
|
||||
#' the number of rounds to zero in order to overwrite rounds).
|
||||
#' - iter_feval Evaluation metrics for `evals` that were supplied, either
|
||||
#' determined by the objective, or by parameter `feval`.
|
||||
#'
|
||||
#' Note that any R attributes that are assigned to the booster during the callback functions,
|
||||
#' will not be kept thereafter as the booster object variable is not re-assigned during
|
||||
#' training. It is however possible to set C-level attributes of the booster through
|
||||
#' \link{xgb.attr} or \link{xgb.attributes}, which should remain available for the rest
|
||||
#' of the iterations and after the training is done.
|
||||
#' For [xgb.train()], this will be a named vector with one entry per element in
|
||||
#' `evals`, where the names are determined as 'evals name' + '-' + 'metric name' - for
|
||||
#' example, if `evals` contains an entry named "tr" and the metric is "rmse",
|
||||
#' this will be a one-element vector with name "tr-rmse".
|
||||
#'
|
||||
#' For keeping variables across iterations, it's recommended to use `env` instead.
|
||||
#' \item data The data to which the model is being fit, as an `xgb.DMatrix` object.
|
||||
#' For [xgb.cv()], this will be a 2d matrix with dimensions `[length(evals), nfolds]`,
|
||||
#' where the row names will follow the same naming logic as the one-dimensional vector
|
||||
#' that is passed in [xgb.train()].
|
||||
#'
|
||||
#' Note that, for \link{xgb.cv}, this will be the full data, while data for the specific
|
||||
#' folds can be found in the `model` object.
|
||||
#' Note that, internally, the built-in callbacks such as [xgb.cb.print.evaluation] summarize
|
||||
#' this table by calculating the row-wise means and standard deviations.
|
||||
#'
|
||||
#' \item evals The evaluation data, as passed under argument `evals` to
|
||||
#' \link{xgb.train}.
|
||||
#' - final_feval The evaluation results after the last boosting round is executed
|
||||
#' (same format as `iter_feval`, and will be the exact same input as passed under
|
||||
#' `iter_feval` to the last round that is executed during model fitting).
|
||||
#'
|
||||
#' For \link{xgb.cv}, this will always be `NULL`.
|
||||
#' - prev_cb_res Result from a previous run of a callback sharing the same name
|
||||
#' (as given by parameter `cb_name`) when conducting training continuation, if there
|
||||
#' was any in the booster R attributes.
|
||||
#'
|
||||
#' \item begin_iteration Index of the first boosting iteration that will be executed
|
||||
#' (base-1 indexing).
|
||||
#' Sometimes, one might want to append the new results to the previous one, and this will
|
||||
#' be done automatically by the built-in callbacks such as [xgb.cb.evaluation.log],
|
||||
#' which will append the new rows to the previous table.
|
||||
#'
|
||||
#' This will typically be '1', but when using training continuation, depending on the
|
||||
#' parameters for updates, boosting rounds will be continued from where the previous
|
||||
#' model ended, in which case this will be larger than 1.
|
||||
#' If no such previous callback result is available (which it never will when fitting
|
||||
#' a model from start instead of updating an existing model), this will be `NULL`.
|
||||
#'
|
||||
#' \item end_iteration Index of the last boostign iteration that will be executed
|
||||
#' (base-1 indexing, inclusive of this end).
|
||||
#' For [xgb.cv()], which doesn't support training continuation, this will always be `NULL`.
#'
#' It should match with argument `nrounds` passed to [xgb.train()] or [xgb.cv()].
#'
#' Note that boosting might be interrupted before reaching this last iteration, for
#' example by using the early stopping callback [xgb.cb.early.stop].
#' \item iteration Index of the iteration number that is being executed (first iteration
#' will be the same as parameter `begin_iteration`, then the next one will add +1, and so on).
#'
#' \item iter_feval Evaluation metrics for `evals` that were supplied, either
#' determined by the objective, or by parameter `feval`.
#'
#' For [xgb.train()], this will be a named vector with one entry per element in
#' `evals`, where the names are determined as 'evals name' + '-' + 'metric name' - for
#' example, if `evals` contains an entry named "tr" and the metric is "rmse",
#' this will be a one-element vector with name "tr-rmse".
#'
#' For [xgb.cv()], this will be a 2d matrix with dimensions `[length(evals), nfolds]`,
#' where the row names will follow the same naming logic as the one-dimensional vector
#' that is passed in [xgb.train()].
#'
#' Note that, internally, the built-in callbacks such as [xgb.cb.print.evaluation] summarize
#' this table by calculating the row-wise means and standard deviations.
#'
#' \item final_feval The evaluation results after the last boosting round is executed
#' (same format as `iter_feval`, and will be the exact same input as passed under
#' `iter_feval` to the last round that is executed during model fitting).
#'
#' \item prev_cb_res Result from a previous run of a callback sharing the same name
#' (as given by parameter `cb_name`) when conducting training continuation, if there
#' was any in the booster R attributes.
#'
#' Sometimes, one might want to append the new results to the previous ones, and this will
#' be done automatically by the built-in callbacks such as [xgb.cb.evaluation.log],
#' which will append the new rows to the previous table.
#'
#' If no such previous callback result is available (which will never be the case when fitting
#' a model from scratch instead of updating an existing model), this will be `NULL`.
#'
#' For [xgb.cv()], which doesn't support training continuation, this will always be `NULL`.
#' }
#'
#' The following names (`cb_name` values) are reserved for internal callbacks:
#' - print_evaluation
#' - evaluation_log
#' - reset_parameters
#' - early_stop
#' - save_model
#' - cv_predict
#' - gblinear_history
#'
#' The following names are reserved for other non-callback attributes:
#' - names
#' - class
#' - call
#' - params
#' - niter
#' - nfeatures
#' - folds
#'
#' When using the built-in early stopping callback ([xgb.cb.early.stop]), said callback
#' will always be executed before the others, as it sets some booster C-level attributes
#' that other callbacks might also use. Otherwise, the order of execution will match with
#' the order in which the callbacks are passed to the model fitting function.
#'
#' @param cb_name Name for the callback.
#'
#' If the callback produces some non-NULL result (from executing the function passed under
#' `f_after_training`), that result will be added as an R attribute to the resulting booster
#' (or as a named element in the result of CV), with the attribute name specified here.
#'
#' Names of callbacks must be unique - i.e. there cannot be two callbacks with the same name.
#' @param env An environment object that will be passed to the different functions in the callback.
#' Note that this environment will not be shared with other callbacks.
#' @param f_before_training A function that will be executed before the training has started.
#'
#' If passing `NULL` for this or for the other function inputs, then no function will be executed.
#'
#' If passing a function, it will be called with parameters supplied as non-named arguments
#' matching the function signatures that are shown in the default value for each function argument.
#' @param f_before_iter A function that will be executed before each boosting round.
#'
#' This function can signal whether the training should be finalized or not, by outputting
#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at
#' a given round is `TRUE`, then training will be stopped before the current iteration happens.
#'
#' Return values of `NULL` will be interpreted as `FALSE`.
#' @param f_after_iter A function that will be executed after each boosting round.
#'
#' This function can signal whether the training should be finalized or not, by outputting
#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at
#' a given round is `TRUE`, then training will be stopped at that round.
#'
#' Return values of `NULL` will be interpreted as `FALSE`.
#' @param f_after_training A function that will be executed after training is finished.
#'
#' This function can optionally output something non-NULL, which will become part of the R
#' attributes of the booster (assuming one passes `keep_extra_attributes=TRUE` to [xgb.train()])
#' under the name supplied for parameter `cb_name` in the case of [xgb.train()]; or a part
#' of the named elements in the result of [xgb.cv()].
#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()].
#'
#' @seealso Built-in callbacks:
#' - [xgb.cb.print.evaluation]
#' - [xgb.cb.evaluation.log]
#' - [xgb.cb.reset.parameters]
#' - [xgb.cb.early.stop]
#' - [xgb.cb.save.model]
#' - [xgb.cb.cv.predict]
#' - [xgb.cb.gblinear.history]
#'
#' @examples
#' # Example constructing a custom callback that calculates
#' # squared error on the training data (no separate test set),
@@ -203,8 +197,10 @@
#' )
#'
#' data(mtcars)
#'
#' y <- mtcars$mpg
#' x <- as.matrix(mtcars[, -1])
#'
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
#' model <- xgb.train(
#'   data = dm,
@@ -407,16 +403,18 @@ xgb.Callback <- function(
  return(paste0(iter, res))
}
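
# A minimal sketch of such a custom callback, assuming the argument signatures
# described above (env, model, data, evals, the iteration counters, and the
# 'iter_feval' / 'final_feval' / 'prev_cb_res' values):
ssq_callback <- xgb.Callback(
  cb_name = "ssq",
  f_before_training = function(env, model, data, evals,
                               begin_iteration, end_iteration) {
    # pre-allocate one slot per boosting round
    env$logs <- rep(NA_real_, end_iteration - begin_iteration + 1)
  },
  f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
    # squared error on the training data at this round
    pred <- predict(model, data)
    err <- pred - getinfo(data, "label")
    env$logs[iteration] <- sum(err^2)
    # returning FALSE signals that training should not be stopped here
    return(FALSE)
  },
  f_after_training = function(env, model, data, evals, iteration,
                              final_feval, prev_cb_res) {
    # the returned value is attached to the booster as R attribute 'ssq'
    return(env$logs)
  }
)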

#' Callback for printing the result of evaluation
#'
#' @description
#' The callback function prints the result of evaluation at every `period` iterations.
#' The initial and the last iteration's evaluations are always printed.
#'
#' Does not leave any attribute in the booster (see [xgb.cb.evaluation.log] for that).
#'
#' @param period Results will be printed every `period` iterations.
#' @param showsd Whether standard deviations should be printed (when available).
#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()].
#' @seealso [xgb.Callback]
#' @export
xgb.cb.print.evaluation <- function(period = 1, showsd = TRUE) {
  if (length(period) != 1 || period != floor(period) || period < 1) {
@@ -450,14 +448,16 @@ xgb.cb.print.evaluation <- function(period = 1, showsd = TRUE) {
  )
}
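
# A usage sketch, assuming 'dtrain' is an existing xgb.DMatrix:
# print the train metric only every 10 rounds instead of at each one.
model <- xgb.train(
  params = list(objective = "reg:squarederror", nthread = 1),
  data = dtrain,
  evals = list(train = dtrain),
  nrounds = 50,
  callbacks = list(xgb.cb.print.evaluation(period = 10))
)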

#' Callback for logging the evaluation history
#'
#' @details
#' This callback creates a table with per-iteration evaluation metrics (see parameters
#' `evals` and `feval` in [xgb.train()]).
#'
#' Note: in the column names of the final data.table, the dash '-' character is replaced with
#' the underscore '_' in order to make the column names more like regular R identifiers.
#'
#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()].
#' @seealso [xgb.cb.print.evaluation]
#' @export
xgb.cb.evaluation.log <- function() {
  xgb.Callback(
@@ -517,20 +517,22 @@ xgb.cb.evaluation.log <- function() {
  )
}
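
# A usage sketch, assuming 'dtrain' is an existing xgb.DMatrix: the logged
# history is attached to the booster as R attribute 'evaluation_log', with
# dashes in metric names turned into underscores (e.g. 'train_rmse').
model <- xgb.train(
  params = list(objective = "reg:squarederror", nthread = 1),
  data = dtrain,
  evals = list(train = dtrain),
  nrounds = 10,
  callbacks = list(xgb.cb.evaluation.log())
)
attributes(model)$evaluation_log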

#' Callback for resetting booster parameters at each iteration
#'
#' @details
#' Note that when training is resumed from some previous model, and a function is used to
#' reset a parameter value, the `nrounds` argument in this function would be the
#' number of boosting rounds in the current training.
#'
#' Does not leave any attribute in the booster.
#'
#' @param new_params List of parameters to be reset.
#' Each element's value must be either a vector of values of length `nrounds`
#' to be set at each iteration,
#' or a function of two parameters `learning_rates(iteration, nrounds)`
#' which returns a new parameter value by using the current iteration number
#' and the total number of boosting rounds.
#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()].
#' @export
xgb.cb.reset.parameters <- function(new_params) {
  stopifnot(is.list(new_params))
@@ -583,39 +585,39 @@ xgb.cb.reset.parameters <- function(new_params) {
  )
}
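
# A usage sketch, assuming 'dtrain' exists: decay the learning rate each round,
# here through a function of the current iteration and total rounds.
eta_decay <- function(iteration, nrounds) {
  0.3 * 0.99^(iteration - 1)
}
model <- xgb.train(
  params = list(objective = "reg:squarederror", eta = 0.3, nthread = 1),
  data = dtrain,
  nrounds = 100,
  callbacks = list(xgb.cb.reset.parameters(list(eta = eta_decay)))
)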

#' Callback to activate early stopping
#'
#' @description
#' This callback function determines the condition for early stopping.
#'
#' The following attributes are assigned to the booster's object:
#' - `best_score` the evaluation score at the best iteration
#' - `best_iteration` at which boosting iteration the best score has occurred
#'   (0-based index for interoperability of binary models)
#'
#' The same values are also stored as R attributes as a result of the callback, plus an additional
#' attribute `stopped_by_max_rounds` which indicates whether an early stopping by the `stopping_rounds`
#' condition occurred. Note that the `best_iteration` that is stored under R attributes will follow
#' base-1 indexing, so it will be larger by '1' than the C-level 'best_iteration' that is accessed
#' through [xgb.attr()] or [xgb.attributes()].
#'
#' At least one dataset is required in `evals` for early stopping to work.
#'
#' @param stopping_rounds The number of rounds with no improvement in
#' the evaluation metric in order to stop the training.
#' @param maximize Whether to maximize the evaluation metric.
#' @param metric_name The name of an evaluation column to use as a criterion for early
#' stopping. If not set, the last column would be used.
#' Let's say the test data in `evals` was labelled as `dtest`,
#' and one wants to use the AUC in test data for early stopping regardless of where
#' it is in the `evals`, then one of the following would need to be set:
#' `metric_name = 'dtest-auc'` or `metric_name = 'dtest_auc'`.
#' All dash '-' characters in metric names are considered equivalent to '_'.
#' @param verbose Whether to print the early stopping information.
#' @param keep_all_iter Whether to keep all of the boosting rounds that were produced
#' in the resulting object. If passing `FALSE`, will only keep the boosting rounds
#' up to the detected best iteration, discarding the ones that come after.
#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()].
#' @export
xgb.cb.early.stop <- function(
  stopping_rounds,
@@ -771,21 +773,22 @@ xgb.cb.early.stop <- function(
    xgb.save(model, save_name)
}
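
# A usage sketch, assuming 'dtrain' and 'dtest' are existing xgb.DMatrix
# objects: stop when the test AUC has not improved for 10 consecutive rounds.
model <- xgb.train(
  params = list(objective = "binary:logistic", eval_metric = "auc", nthread = 1),
  data = dtrain,
  evals = list(dtrain = dtrain, dtest = dtest),
  nrounds = 500,
  callbacks = list(xgb.cb.early.stop(
    stopping_rounds = 10,
    metric_name = "dtest-auc",
    maximize = TRUE
  ))
)
# base-1 best iteration kept as an R attribute, per the note above:
attributes(model)$best_iteration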

#' Callback for saving a model file
#'
#' @description
#' This callback function saves an XGBoost model file, either periodically
#' after every `save_period` iterations, or once at the end of training.
#'
#' Does not leave any attribute in the booster.
#'
#' @param save_period Save the model to disk after every `save_period` iterations;
#' 0 means save the model at the end.
#' @param save_name The name or path for the saved model file.
#' It can contain a [sprintf()] formatting specifier to include the integer
#' iteration number in the file name. E.g., with `save_name = 'xgboost_%04d.model'`,
#' the file saved at iteration 50 would be named "xgboost_0050.model".
#' @return An `xgb.Callback` object, which can be passed to [xgb.train()],
#' but **not** to [xgb.cv()].
#' @export
xgb.cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
  if (save_period < 0) {
@@ -817,24 +820,26 @@ xgb.cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
  )
}
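
# A usage sketch, assuming 'dtrain' exists: write a checkpoint every 10 rounds,
# with the iteration number substituted into the file name via sprintf().
model <- xgb.train(
  params = list(objective = "reg:squarederror", nthread = 1),
  data = dtrain,
  nrounds = 100,
  callbacks = list(xgb.cb.save.model(
    save_period = 10,
    save_name = file.path(tempdir(), "xgboost_%04d.ubj")
  ))
)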

#' Callback for returning cross-validation based predictions
#'
#' @description
#' This callback function saves predictions for all of the test folds,
#' and also allows saving the folds' models.
#'
#' @details
#' Predictions are saved inside of the `pred` element, which is either a vector or a matrix,
#' depending on the number of prediction outputs per data row. The order of predictions corresponds
#' to the order of rows in the original dataset. Note that when a custom `folds` list is
#' provided in [xgb.cv()], the predictions would only be returned properly when this list is a
#' non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be
#' meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits.
#' When some of the indices in the training dataset are not included into user-provided `folds`,
#' their prediction value would be `NA`.
#'
#' @param save_models A flag for whether to save the folds' models.
#' @param outputmargin Whether to save margin predictions (same effect as passing this
#' parameter to [predict.xgb.Booster]).
#' @return An `xgb.Callback` object, which can be passed to [xgb.cv()],
#' but **not** to [xgb.train()].
#' @export
xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) {
  xgb.Callback(
@@ -853,8 +858,7 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) {
      pr <- predict(
        fd$bst,
        fd$evals[[2L]],
        outputmargin = env$outputmargin
      )
      if (is.null(pred)) {
        if (NCOL(pr) > 1L) {
@@ -904,19 +908,15 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) {
  return(coefs)
}
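
# A usage sketch, assuming 'dtrain' exists: collect out-of-fold predictions,
# returned under the 'pred' element of the CV result as described above.
cv_res <- xgb.cv(
  params = list(objective = "binary:logistic", nthread = 1),
  data = dtrain,
  nfold = 5,
  nrounds = 20,
  callbacks = list(xgb.cb.cv.predict(save_models = FALSE))
)
head(cv_res$pred)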

#' Callback for collecting coefficients history of a gblinear booster
#'
#' @details
#' To keep things fast and simple, gblinear booster does not internally store the history of linear
#' model coefficients at each boosting iteration. This callback provides a workaround for storing
#' the coefficients' path, by extracting them after each training iteration.
#'
#' This callback will construct a matrix where rows are boosting iterations and columns are
#' feature coefficients (same order as when calling [coef.xgb.Booster], with the intercept
#' corresponding to the first column).
#'
#' When there is more than one coefficient per feature (e.g. multi-class classification),
@@ -929,13 +929,18 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) {
#' one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index'
#' (so e.g. column 'c1' for class '0' will be named 'c1:0').
#'
#' With [xgb.train()], the output is either a dense or a sparse matrix.
#' With [xgb.cv()], it is a list (one element per each fold) of such matrices.
#'
#' Function [xgb.gblinear.history] provides an easy way to retrieve the
#' outputs from this callback.
#'
#' @param sparse When set to `FALSE`/`TRUE`, a dense/sparse matrix is used to store the result.
#' Sparse format is useful when one expects only a subset of coefficients to be non-zero,
#' when using the "thrifty" feature selector with a fairly small number of top features
#' selected per iteration.
#' @return An `xgb.Callback` object, which can be passed to [xgb.train()] or [xgb.cv()].
#' @seealso [xgb.gblinear.history], [coef.xgb.Booster].
#' @examples
#' #### Binary classification:
#'
@@ -945,57 +950,109 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) {
#'
#' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest
#' # without considering the 2nd order interactions:
#' x <- model.matrix(Species ~ .^2, iris)[, -1]
#' colnames(x)
#' dtrain <- xgb.DMatrix(
#'   scale(x),
#'   label = 1 * (iris$Species == "versicolor"),
#'   nthread = nthread
#' )
#' param <- list(
#'   booster = "gblinear",
#'   objective = "reg:logistic",
#'   eval_metric = "auc",
#'   lambda = 0.0003,
#'   alpha = 0.0003,
#'   nthread = nthread
#' )
#'
#' # For 'shotgun', which is a default linear updater, using high eta values may result in
#' # unstable behaviour in some datasets. With this simple dataset, however, the high learning
#' # rate does not break the convergence, but allows us to illustrate the typical pattern of
#' # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations.
#' bst <- xgb.train(
#'   param,
#'   dtrain,
#'   list(tr = dtrain),
#'   nrounds = 200,
#'   eta = 1.,
#'   callbacks = list(xgb.cb.gblinear.history())
#' )
#'
#' # Extract the coefficients' path and plot them vs boosting iteration number:
#' coef_path <- xgb.gblinear.history(bst)
#' matplot(coef_path, type = "l")
#'
#' # With the deterministic coordinate descent updater, it is safer to use higher learning rates.
#' # Will try the classical componentwise boosting which selects a single best feature per round:
#' bst <- xgb.train(
#'   param,
#'   dtrain,
#'   list(tr = dtrain),
#'   nrounds = 200,
#'   eta = 0.8,
#'   updater = "coord_descent",
#'   feature_selector = "thrifty",
#'   top_k = 1,
#'   callbacks = list(xgb.cb.gblinear.history())
#' )
#' matplot(xgb.gblinear.history(bst), type = "l")
#' # Componentwise boosting is known to have similar effect to Lasso regularization.
#' # Try experimenting with various values of top_k, eta, nrounds,
#' # as well as different feature_selectors.
#'
#' # For xgb.cv:
#' bst <- xgb.cv(
#'   param,
#'   dtrain,
#'   nfold = 5,
#'   nrounds = 100,
#'   eta = 0.8,
#'   callbacks = list(xgb.cb.gblinear.history())
#' )
#' # coefficients in the CV fold #3
#' matplot(xgb.gblinear.history(bst)[[3]], type = "l")
#'
#' #### Multiclass classification:
#'
#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread)
#'
#' param <- list(
#'   booster = "gblinear",
#'   objective = "multi:softprob",
#'   num_class = 3,
#'   lambda = 0.0003,
#'   alpha = 0.0003,
#'   nthread = nthread
#' )
#'
#' # For the default linear updater 'shotgun' it sometimes is helpful
#' # to use smaller eta to reduce instability
#' bst <- xgb.train(
#'   param,
#'   dtrain,
#'   list(tr = dtrain),
#'   nrounds = 50,
#'   eta = 0.5,
#'   callbacks = list(xgb.cb.gblinear.history())
#' )
#'
#' # Will plot the coefficient paths separately for each class:
#' matplot(xgb.gblinear.history(bst, class_index = 0), type = "l")
#' matplot(xgb.gblinear.history(bst, class_index = 1), type = "l")
#' matplot(xgb.gblinear.history(bst, class_index = 2), type = "l")
#'
#' # CV:
#' bst <- xgb.cv(
#'   param,
#'   dtrain,
#'   nfold = 5,
#'   nrounds = 70,
#'   eta = 0.5,
#'   callbacks = list(xgb.cb.gblinear.history(FALSE))
#' )
#' # 1st fold of 1st class
#' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = "l")
#'
#' @export
xgb.cb.gblinear.history <- function(sparse = FALSE) {
@@ -1098,28 +1155,31 @@ xgb.cb.gblinear.history <- function(sparse = FALSE) {
  )
}

#' Extract gblinear coefficients history
#'
#' A helper function to extract the matrix of linear coefficients' history
#' from a gblinear model created while using the [xgb.cb.gblinear.history]
#' callback (which must be added manually, as by default it is not used).
#'
#' @details
#' Note that this is an R-specific function that relies on R attributes that
#' are not saved when using XGBoost's own serialization functions like [xgb.load()]
#' or [xgb.load.raw()].
#'
#' In order for a serialized model to be accepted by this function, one must use R
#' serializers such as [saveRDS()].
#' @param model Either an `xgb.Booster` or a result of [xgb.cv()], trained
#' using the [xgb.cb.gblinear.history] callback, but **not** a booster
#' loaded from [xgb.load()] or [xgb.load.raw()].
#' @param class_index Zero-based class index to extract the coefficients for only that
#' specific class in a multinomial multiclass model. When it is `NULL`, all the
#' coefficients are returned. Has no effect in non-multiclass models.
#'
#' @return
#' For an [xgb.train()] result, a matrix (either dense or sparse) with the columns
#' corresponding to iteration's coefficients and the rows corresponding to boosting iterations.
#'
#' For an [xgb.cv()] result, a list of such matrices is returned with the elements
#' corresponding to CV folds.
#'
#' When there is more than one coefficient per feature (e.g. multi-class classification)
@@ -1127,7 +1187,7 @@ xgb.cb.gblinear.history <- function(sparse = FALSE) {
#' the result will be reshaped into a vector where coefficients are arranged first by features and
#' then by class (e.g. first 1 through N coefficients will be for the first class, then
#' coefficients N+1 through 2N for the second class, and so on).
#' @seealso [xgb.cb.gblinear.history], [coef.xgb.Booster].
#' @export
xgb.gblinear.history <- function(model, class_index = NULL) {

@@ -27,8 +27,41 @@ NVL <- function(x, val) {
}

.RANKING_OBJECTIVES <- function() {
  return(c('rank:pairwise', 'rank:ndcg', 'rank:map'))
}

.OBJECTIVES_NON_DEFAULT_MODE <- function() {
  return(c("reg:logistic", "binary:logitraw", "multi:softmax"))
}

.BINARY_CLASSIF_OBJECTIVES <- function() {
  return(c("binary:logistic", "binary:hinge"))
}

.MULTICLASS_CLASSIF_OBJECTIVES <- function() {
  return("multi:softprob")
}

.SURVIVAL_RIGHT_CENSORING_OBJECTIVES <- function() { # nolint
  return(c("survival:cox", "survival:aft"))
}

.SURVIVAL_ALL_CENSORING_OBJECTIVES <- function() { # nolint
  return("survival:aft")
}

.REGRESSION_OBJECTIVES <- function() {
  return(c(
    "reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror",
    "reg:absoluteerror", "reg:quantileerror", "count:poisson", "reg:gamma", "reg:tweedie"
  ))
}

.MULTI_TARGET_OBJECTIVES <- function() {
  return(c(
    "reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror",
    "reg:quantileerror", "reg:gamma"
  ))
}
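
# A sketch of how these internal lookup lists can be used to branch on the
# objective type (hypothetical usage, not an exported interface):
objective <- "multi:softprob"
if (objective %in% .MULTICLASS_CLASSIF_OBJECTIVES()) {
  # multiclass objectives additionally require 'num_class', e.g.:
  params <- list(objective = objective, num_class = 3)
}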

@@ -71,7 +104,7 @@ check.booster.params <- function(params, ...) {

  # for multiclass, expect num_class to be set
  if (typeof(params[['objective']]) == "character" &&
        startsWith(NVL(params[['objective']], 'x'), 'multi:') &&
        as.numeric(NVL(params[['num_class']], 0)) < 2) {
    stop("'num_class' > 1 parameter must be set for multiclass classification")
  }
@@ -166,8 +199,7 @@ xgb.iter.update <- function(bst, dtrain, iter, obj) {
    bst,
    dtrain,
    outputmargin = TRUE,
    training = TRUE
  )
  gpair <- obj(pred, dtrain)
  n_samples <- dim(dtrain)[1]
@@ -378,7 +410,7 @@ xgb.createFolds <- function(y, k) {
#' At this time, some of the parameter names were changed in order to make the code style more uniform.
#' The deprecated parameters will be removed in the next release.
#'
#' To see all the current deprecated and new parameters, check the `xgboost:::depr_par_lut` table.
#'
#' A deprecation warning is shown when any of the deprecated parameters is used in a call.
#' An additional warning is shown when there was a partial match to a deprecated parameter
@@ -387,70 +419,90 @@ xgb.createFolds <- function(y, k) {
#' @name xgboost-deprecated
NULL

#' Model Serialization and Compatibility
#'
#' @description
#' When it comes to serializing XGBoost models, it's possible to use R serializers such as
#' [save()] or [saveRDS()] to serialize an XGBoost R model, but XGBoost also provides
#' its own serializers with better compatibility guarantees, which allow loading
#' said models in other language bindings of XGBoost.
#'
#' Note that an `xgb.Booster` object (**as produced by [xgb.train()]**, see rest of the doc
#' for objects produced by [xgboost()]), outside of its core components, might also keep:
#' - Additional model configuration (accessible through [xgb.config()]), which includes
#'   model fitting parameters like `max_depth` and runtime parameters like `nthread`.
#'   These are not necessarily useful for prediction/importance/plotting.
#' - Additional R specific attributes - e.g. results of callbacks, such as evaluation logs,
#'   which are kept as a `data.table` object, accessible through
#'   `attributes(model)$evaluation_log` if present.
#'
#' The first one (configurations) does not have the same compatibility guarantees as
#' the model itself, including attributes that are set and accessed through
#' [xgb.attributes()] - that is, such configuration might be lost after loading the
#' booster in a different XGBoost version, regardless of the serializer that was used.
#' These are saved when using [saveRDS()], but will be discarded if loaded into an
#' incompatible XGBoost version. They are not saved when using XGBoost's
#' serializers from its public interface including [xgb.save()] and [xgb.save.raw()].
#'
#' The second ones (R attributes) are not part of the standard XGBoost model structure,
#' and thus are not saved when using XGBoost's own serializers. These attributes are
#' only used for informational purposes, such as keeping track of evaluation metrics as
#' the model was fit, or saving the R call that produced the model, but are otherwise
#' not used for prediction / importance / plotting / etc.
#' These R attributes are only preserved when using R's serializers.
#'
#' In addition to the regular `xgb.Booster` objects produced by [xgb.train()], the
#' function [xgboost()] produces a different subclass `xgboost`, which keeps other
#' additional metadata as R attributes such as class names in classification problems,
#' and which has a dedicated `predict` method that uses different defaults. XGBoost's
#' own serializers can work with this `xgboost` class, but as they do not keep R
#' attributes, the resulting object, when deserialized, is downcast to the regular
#' `xgb.Booster` class (i.e. it loses the metadata, and the resulting object will use
#' `predict.xgb.Booster` instead of `predict.xgboost`) - for these `xgboost` objects,
#' `saveRDS` might thus be a better option if the extra functionalities are needed.
#'
#' Note that XGBoost models in R starting from version `2.1.0` and onwards, and
#' XGBoost models before version `2.1.0`, have a very different R object structure and
#' are incompatible with each other. Hence, models that were saved with R serializers
#' like [saveRDS()] or [save()] before version `2.1.0` will not work with later
#' `xgboost` versions and vice versa. Be aware that the structure of R model objects
#' could in theory change again in the future, so XGBoost's serializers
#' should be preferred for long-term storage.
#'
#' Furthermore, note that using the package `qs` for serialization will require
#' version 0.26 or higher of said package, and will have the same compatibility
#' restrictions as R serializers.
#'
#' @details
#' Use [xgb.save()] to save the XGBoost model as a stand-alone file. You may opt into
#' the JSON format by specifying the JSON extension. To read the model back, use
#' [xgb.load()].
#'
#' Use [xgb.save.raw()] to save the XGBoost model as a sequence (vector) of raw bytes
#' in a future-proof manner. Future releases of XGBoost will be able to read the raw bytes and
#' re-construct the corresponding model. To read the model back, use [xgb.load.raw()].
#' The [xgb.save.raw()] function is useful if you would like to persist the XGBoost model
#' as part of another R object.
#'
#' Use [saveRDS()] if you require the R-specific attributes that a booster might have, such
#' as evaluation logs or the model class `xgboost` instead of `xgb.Booster`, but note that
#' future compatibility of such objects is outside XGBoost's control as it relies on R's
#' serialization format (see e.g. the details section in [serialize] and [save()] from base R).
#'
#' For more details and explanation about model persistence and archival, consult the page
#' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
#'
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
#' bst <- xgb.train(
#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = 2,
#'   nrounds = 2,
#'   objective = "binary:logistic"
#' )
#'
#' # Save as a stand-alone file; load it with xgb.load()
#' fname <- file.path(tempdir(), "xgb_model.ubj")
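#'
#' # A sketch of the full round trip (continuing the example above, using the
#' # counterpart loaders documented in the details section):
#' xgb.save(bst, fname)
#' bst_loaded <- xgb.load(fname)
#'
#' # Save as raw bytes in memory; re-create the booster with xgb.load.raw()
#' raw_bytes <- xgb.save.raw(bst)
#' bst_from_raw <- xgb.load.raw(raw_bytes)
#'
#' # R attributes (e.g. evaluation logs) survive only through R serializers
#' saveRDS(bst, file.path(tempdir(), "xgb_model.rds"))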

@@ -1,4 +1,4 @@
# Construct an internal XGBoost Booster and get its current number of rounds.
# internal utility function
# Note: the number of rounds in the C booster gets reset to zero when changing
# key booster parameters like 'process_type=update', but in some cases, when
@@ -64,7 +64,7 @@ xgb.get.handle <- function(object) {
  if (inherits(object, "xgb.Booster")) {
    handle <- object$ptr
    if (is.null(handle) || !inherits(handle, "externalptr")) {
      stop("'xgb.Booster' object is corrupted or is from an incompatible XGBoost version.")
    }
  } else {
    stop("argument must be an 'xgb.Booster' object.")
@@ -77,84 +77,96 @@ xgb.get.handle <- function(object) {

#' Predict method for XGBoost model
#'
#' Predict values on data based on XGBoost model.
#'
#' @param object Object of class `xgb.Booster`.
#' @param newdata Takes `data.frame`, `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`,
#'   local data file, or `xgb.DMatrix`.
#'
#'   For single-row predictions on sparse data, it is recommended to use CSR format. If passing
#'   a sparse vector, it will take it as a row vector.
#'
#'   Note that, for repeated predictions on the same data, one might want to create a DMatrix to
#'   pass here instead of passing R types like matrices or data frames, as predictions will be
#'   faster on DMatrix.
#'
#'   If `newdata` is a `data.frame`, be aware that:
#'   - Columns will be converted to numeric if they aren't already, which could potentially make
#'     the operation slower than in an equivalent `matrix` object.
#'   - The order of the columns must match with that of the data from which the model was fitted
#'     (i.e. columns will not be referenced by their names, just by their order in the data).
#'   - If the model was fitted to data with categorical columns, these columns must be of
#'     `factor` type here, and must use the same encoding (i.e. have the same levels).
#'   - If `newdata` contains any `factor` columns, they will be converted to base-0
#'     encoding (same as during DMatrix creation) - hence, one should not pass a `factor`
#'     under a column which during training had a different type.
#' @param missing Float value that represents missing values in data
#'   (e.g., 0 or some other extreme value).
#'
#'   This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases,
#'   should pass this as an argument to the DMatrix constructor instead.
#' @param outputmargin Whether the prediction should be returned in the form of
#'   original untransformed sum of predictions from boosting iterations' results.
#'   E.g., setting `outputmargin = TRUE` for logistic regression would return log-odds
#'   instead of probabilities.
#' @param predleaf Whether to predict per-tree leaf indices.
#' @param predcontrib Whether to return feature contributions to individual predictions (see Details).
#' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details).
#' @param predinteraction Whether to return contributions of feature interactions to individual predictions (see Details).
#' @param training Whether the prediction result is used for training. For dart booster,
#'   training predicting will perform dropout.
#' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing
#'   a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e.
#'   base-1 indexing, and inclusive of both ends).
#'
#'   For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will
#'   predict using only the first one.
#'
#'   If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all
#'   of the iterations (rounds) otherwise.
#'
#'   If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
#' @param strict_shape Whether to always return an array with the same dimensions for the given prediction mode
#'   regardless of the model type - meaning that, for example, both a multi-class and a binary classification
#'   model would generate output arrays with the same number of dimensions, with the 'class' dimension having
#'   size equal to '1' for the binary model.
#'
#'   If passing `FALSE` (the default), dimensions will be simplified according to the model type, so that a
#'   binary classification model for example would not have a redundant dimension for 'class'.
#'
#'   See documentation for the return type for the exact shape of the output arrays for each prediction mode.
#' @param avoid_transpose Whether to output the resulting predictions in the same memory layout in which they
#'   are generated by the core XGBoost library, without transposing them to match the expected output shape.
#'
#'   Internally, XGBoost uses row-major order for the predictions it generates, while R arrays use column-major
#'   order, hence the result needs to be transposed in order to have the expected shape when represented as
#'   an R array or matrix, which might be a slow operation.
#'
#'   If passing `TRUE`, then the result will have dimensions in reverse order - for example, rows
#'   will be the last dimension instead of the first dimension.
#' @param base_margin Base margin used for boosting from existing model.
#'
#'   Note that, if `newdata` is an `xgb.DMatrix` object, this argument will
#'   be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
#'   an argument in its constructor, or by calling [setinfo.xgb.DMatrix()]).
#' @param validate_features When `TRUE`, validate that the Booster's and newdata's
#'   feature_names match (only applicable when both `object` and `newdata` have feature names).
#'
#'   If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder
#'   the columns in `newdata` to match with the booster's.
#'
#'   If the booster has feature types and `newdata` is either an `xgb.DMatrix` or
#'   `data.frame`, will additionally verify that categorical columns are of the
#'   correct type in `newdata`, throwing an error if they do not match.
#'
#'   If passing `FALSE`, it is assumed that the feature names and types are the same,
#'   and come in the same order as in the training data.
#'
#'   Note that this check might add some sizable latency to the predictions, so it's
#'   recommended to disable it for performance-sensitive applications.
#' @param ... Not used.
#'
#' @details
#'
#' Note that `iterationrange` would currently do nothing for predictions from "gblinear",
#' since "gblinear" doesn't keep its boosting history.
#'
|
||||
@ -180,28 +192,45 @@ xgb.get.handle <- function(object) {
|
||||
#' Note that converting a matrix to [xgb.DMatrix()] uses multiple threads too.
|
||||
#'
|
||||
#' @return
#' The return type depends on `strict_shape`. If `FALSE` (default):
#' - For regression or binary classification: A vector of length `nrows(newdata)`.
#' - For multiclass classification: A vector of length `num_class * nrows(newdata)` or
#'   a `(nrows(newdata), num_class)` matrix, depending on the `reshape` value.
#' - When `predleaf = TRUE`: A matrix with one column per tree.
#' - When `predcontrib = TRUE`: When not multiclass, a matrix with
#'   `num_features + 1` columns. The last "+ 1" column corresponds to the baseline value.
#'   In the multiclass case, a list of `num_class` such matrices.
#'   The contribution values are on the scale of untransformed margin
#'   (e.g., for binary classification, the values are log-odds deviations from the baseline).
#' - When `predinteraction = TRUE`: When not multiclass, the output is a 3d array of
#'   dimension `c(nrow, num_features + 1, num_features + 1)`. The off-diagonal (in the last two dimensions)
#'   elements represent different feature interaction contributions. The array is symmetric w.r.t. the last
#'   two dimensions. The "+ 1" column corresponds to the baseline. Summing this array along the last dimension should
#'   produce practically the same result as `predcontrib = TRUE`.
#'   In the multiclass case, a list of `num_class` such arrays.
#' A numeric vector or array, with corresponding dimensions depending on the prediction mode and on
#' parameter `strict_shape` as follows:
#'
#' When `strict_shape = TRUE`, the output is always an array:
#' - For normal predictions, the output has dimension `(num_class, nrow(newdata))`.
#' - For `predcontrib = TRUE`, the dimension is `(ncol(newdata) + 1, num_class, nrow(newdata))`.
#' - For `predinteraction = TRUE`, the dimension is `(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))`.
#' - For `predleaf = TRUE`, the dimension is `(n_trees_in_forest, num_class, n_iterations, nrow(newdata))`.
#' If passing `strict_shape=FALSE`:\itemize{
#' \item For regression or binary classification: a vector of length `nrows`.
#' \item For multi-class and multi-target objectives: a matrix of dimensions `[nrows, ngroups]`.
#'
#' Note that objective variant `multi:softmax` defaults to predicting the most likely class (a vector
#' of length `nrows`) instead of per-class probabilities.
#' \item For `predleaf`: a matrix with one column per tree.
#'
#' For multi-class / multi-target, they will be arranged so that columns in the output will have
#' the leaves from one group followed by leaves of the other group (e.g. order will be `group1:feat1`,
#' `group1:feat2`, ..., `group2:feat1`, `group2:feat2`, ...).
#' \item For `predcontrib`: when not multi-class / multi-target, a matrix with dimensions
#' `[nrows, nfeats+1]`. The last "+ 1" column corresponds to the baseline value.
#'
#' For multi-class and multi-target objectives, will be an array with dimensions `[nrows, ngroups, nfeats+1]`.
#'
#' The contribution values are on the scale of untransformed margin (e.g., for binary classification,
#' the values are log-odds deviations from the baseline).
#' \item For `predinteraction`: when not multi-class / multi-target, the output is a 3D array of
#' dimensions `[nrows, nfeats+1, nfeats+1]`. The off-diagonal (in the last two dimensions)
#' elements represent different feature interaction contributions. The array is symmetric w.r.t. the last
#' two dimensions. The "+ 1" column corresponds to the baseline. Summing this array along the last
#' dimension should produce practically the same result as `predcontrib = TRUE`.
#'
#' For multi-class and multi-target, will be a 4D array with dimensions `[nrows, ngroups, nfeats+1, nfeats+1]`.
#' }
#'
#' If passing `strict_shape=TRUE`, the result is always an array:
#' - For normal predictions, the dimension is `[nrows, ngroups]`.
#' - For `predcontrib=TRUE`, the dimension is `[nrows, ngroups, nfeats+1]`.
#' - For `predinteraction=TRUE`, the dimension is `[nrows, ngroups, nfeats+1, nfeats+1]`.
#' - For `predleaf=TRUE`, the dimension is `[nrows, niter, ngroups, num_parallel_tree]`.
#'
#' If passing `avoid_transpose=TRUE`, then the dimensions in all cases will be in reverse order - for
#' example, for `predinteraction`, they will be `[nfeats+1, nfeats+1, ngroups, nrows]`
#' instead of `[nrows, ngroups, nfeats+1, nfeats+1]`.
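To make the shapes above concrete, here is a minimal sketch against the updated interface described in this hunk (`mtcars` is used purely for illustration; the dimension comments restate the documentation rather than guaranteed output):

library(xgboost)
data(mtcars)

x <- as.matrix(mtcars[, -1])
y <- mtcars$mpg
model <- xgb.train(
  data = xgb.DMatrix(x, label = y, nthread = 1),
  params = list(nthread = 1),
  nrounds = 3
)

pred <- predict(model, x)                       # numeric vector of length nrow(x)
contr <- predict(model, x, predcontrib = TRUE)  # matrix [nrows, nfeats+1]

# strict_shape keeps the group axis even for single-target models:
contr_strict <- predict(model, x, predcontrib = TRUE, strict_shape = TRUE)
dim(contr_strict)  # [nrows, ngroups, nfeats+1]

# avoid_transpose returns the same values with the axes reversed:
contr_rev <- predict(model, x, predcontrib = TRUE, strict_shape = TRUE, avoid_transpose = TRUE)
dim(contr_rev)     # [nfeats+1, ngroups, nrows]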
#' @seealso [xgb.train()]
#' @references
#' 1. Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions",
@@ -249,7 +278,7 @@ xgb.get.handle <- function(object) {
#' summary(rowSums(pred_contr) - qlogis(pred))
#' # for the 1st record, let's inspect its features that had non-zero contribution to prediction:
#' contr1 <- pred_contr[1,]
#' contr1 <- contr1[-length(contr1)] # drop BIAS
#' contr1 <- contr1[-length(contr1)] # drop intercept
#' contr1 <- contr1[contr1 != 0] # drop non-contributing features
#' contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
#' old_mar <- par("mar")
@@ -279,8 +308,6 @@ xgb.get.handle <- function(object) {
#' # predict for softmax returns num_class probability numbers per case:
#' pred <- predict(bst, as.matrix(iris[, -5]))
#' str(pred)
#' # reshape it to a num_class-columns matrix
#' pred <- matrix(pred, ncol = num_class, byrow = TRUE)
#' # convert the probabilities to softmax labels
#' pred_labels <- max.col(pred) - 1
#' # the following should result in the same error as seen in the last iteration
@@ -311,8 +338,11 @@ xgb.get.handle <- function(object) {
#' @export
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE,
    predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
    reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE,
    training = FALSE, iterationrange = NULL, strict_shape = FALSE, avoid_transpose = FALSE,
    validate_features = FALSE, base_margin = NULL, ...) {
  if (NROW(list(...))) {
    warning("Passed unused prediction arguments: ", paste(names(list(...)), collapse = ", "), ".")
  }
  if (validate_features) {
    newdata <- validate.features(object, newdata)
  }
@@ -323,6 +353,11 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
      " Should be passed as argument to 'xgb.DMatrix' constructor."
    )
  }
  if (is_dmatrix) {
    rnames <- NULL
  } else {
    rnames <- row.names(newdata)
  }

  use_as_df <- FALSE
  use_as_dense_matrix <- FALSE
@@ -415,10 +450,9 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
    return(val)
  }

  ## We set strict_shape to TRUE then drop the dimensions conditionally
  args <- list(
    training = box(training),
    strict_shape = box(TRUE),
    strict_shape = as.logical(strict_shape),
    iteration_begin = box(as.integer(iterationrange[1])),
    iteration_end = box(as.integer(iterationrange[2])),
    type = box(as.integer(0))
@@ -445,96 +479,49 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA

  json_conf <- jsonlite::toJSON(args, auto_unbox = TRUE)
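For orientation, the object serialized here is a small, flat JSON configuration consumed by the C prediction entry points. A standalone sketch of what `jsonlite::toJSON()` produces for the default arguments (field values are illustrative; `type = 0` is the normal-prediction mode, with other codes selecting the leaf/contribution/interaction modes):

library(jsonlite)

args <- list(
  training = FALSE,
  strict_shape = FALSE,
  iteration_begin = 0L,
  iteration_end = 0L,
  type = 0L
)
toJSON(args, auto_unbox = TRUE)
# {"training":false,"strict_shape":false,"iteration_begin":0,"iteration_end":0,"type":0}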
  if (is_dmatrix) {
    predts <- .Call(
    arr <- .Call(
      XGBoosterPredictFromDMatrix_R, xgb.get.handle(object), newdata, json_conf
    )
  } else if (use_as_dense_matrix) {
    predts <- .Call(
    arr <- .Call(
      XGBoosterPredictFromDense_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin
    )
  } else if (use_as_csr_matrix) {
    predts <- .Call(
    arr <- .Call(
      XGBoosterPredictFromCSR_R, xgb.get.handle(object), csr_data, missing, json_conf, base_margin
    )
  } else if (use_as_df) {
    predts <- .Call(
    arr <- .Call(
      XGBoosterPredictFromColumnar_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin
    )
  }

  names(predts) <- c("shape", "results")
  shape <- predts$shape
  arr <- predts$results

  n_ret <- length(arr)
  if (n_row != shape[1]) {
    stop("Incorrect predict shape.")
  }

  .Call(XGSetArrayDimInplace_R, arr, rev(shape))

  cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
  n_groups <- shape[2]

  ## Needed regardless of whether strict shape is being used.
  if (predcontrib) {
    .Call(XGSetArrayDimNamesInplace_R, arr, list(cnames, NULL, NULL))
  } else if (predinteraction) {
    .Call(XGSetArrayDimNamesInplace_R, arr, list(cnames, cnames, NULL, NULL))
  }
  if (strict_shape) {
    return(arr) # strict shape is calculated by libxgboost uniformly.
  if ((predcontrib || predinteraction) && !is.null(colnames(newdata))) {
    cnames <- c(colnames(newdata), "(Intercept)")
    dim_names <- vector(mode = "list", length = length(dim(arr)))
    dim_names[[1L]] <- cnames
    if (predinteraction) dim_names[[2L]] <- cnames
    .Call(XGSetArrayDimNamesInplace_R, arr, dim_names)
  }

  if (predleaf) {
    ## Predict leaf
    if (n_ret == n_row) {
      .Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L))
  if (NROW(rnames)) {
    if (is.null(dim(arr))) {
      .Call(XGSetVectorNamesInplace_R, arr, rnames)
    } else {
      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
    }
  } else if (predcontrib) {
    ## Predict contribution
    arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
    if (n_ret == n_row) {
      .Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L))
      .Call(XGSetArrayDimNamesInplace_R, arr, list(NULL, cnames))
    } else if (n_groups != 1) {
      ## turns array into list of matrices
      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
    } else {
      ## remove the first axis (group)
      newdim <- dim(arr)[2:3]
      newdn <- dimnames(arr)[2:3]
      arr <- arr[1, , ]
      .Call(XGSetArrayDimInplace_R, arr, newdim)
      .Call(XGSetArrayDimNamesInplace_R, arr, newdn)
    }
  } else if (predinteraction) {
    ## Predict interaction
    arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
    if (n_ret == n_row) {
      .Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L))
      .Call(XGSetArrayDimNamesInplace_R, arr, list(NULL, cnames))
    } else if (n_groups != 1) {
      ## turns array into list of matrices
      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
    } else {
      ## remove the first axis (group)
      arr <- arr[1, , , , drop = FALSE]
      newdim <- dim(arr)[2:4]
      newdn <- dimnames(arr)[2:4]
      .Call(XGSetArrayDimInplace_R, arr, newdim)
      .Call(XGSetArrayDimNamesInplace_R, arr, newdn)
    }
  } else {
    ## Normal prediction
    if (reshape && n_groups != 1) {
      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
    } else {
      .Call(XGSetArrayDimInplace_R, arr, NULL)
      dim_names <- dimnames(arr)
      if (is.null(dim_names)) {
        dim_names <- vector(mode = "list", length = length(dim(arr)))
      }
      dim_names[[length(dim_names)]] <- rnames
      .Call(XGSetArrayDimNamesInplace_R, arr, dim_names)
    }
  }

  if (!avoid_transpose && is.array(arr)) {
    arr <- aperm(arr)
  }

  return(arr)
}

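The consistency property that the documentation asserts - summing interaction contributions over the last axis reproduces `predcontrib` - can be checked numerically. A sketch, reusing `model` and `x` from the earlier example:

contr <- predict(model, x, predcontrib = TRUE)      # [nrows, nfeats+1]
inter <- predict(model, x, predinteraction = TRUE)  # [nrows, nfeats+1, nfeats+1]

# Collapse the last axis and compare; the difference should be near zero:
max(abs(apply(inter, c(1, 2), sum) - contr))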
@@ -618,29 +605,20 @@ validate.features <- function(bst, newdata) {
}


#' @title Accessors for serializable attributes of a model
#' Accessors for serializable attributes of a model
#'
#' @description These methods allow manipulating the key-value attribute strings of an xgboost model.
#'
#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it.
#' @param name A non-empty character string specifying which attribute is to be accessed.
#' @param value For `xgb.attr<-`, a value of an attribute; for `xgb.attributes<-`,
#'   it is a list (or an object coercible to a list) with the names of attributes to set
#'   and the elements corresponding to attribute values.
#'   Non-character values are converted to character.
#'   When an attribute value is not a scalar, only the first index is used.
#'   Use `NULL` to remove an attribute.
#' These methods allow manipulating the key-value attribute strings of an XGBoost model.
#'
#' @details
#' The primary purpose of xgboost model attributes is to store some meta data about the model.
#' The primary purpose of XGBoost model attributes is to store some meta data about the model.
#' Note that they are a separate concept from the object attributes in R.
#' Specifically, they refer to key-value strings that can be attached to an xgboost model,
#' Specifically, they refer to key-value strings that can be attached to an XGBoost model,
#' stored together with the model's binary representation, and accessed later
#' (from R or any other interface).
#' In contrast, any R attribute assigned to an R object of `xgb.Booster` class
#' would not be saved by [xgb.save()] because an xgboost model is an external memory object
#' would not be saved by [xgb.save()] because an XGBoost model is an external memory object
#' and its serialization is handled externally.
#' Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't
#' Also, setting an attribute that has the same name as one of XGBoost's parameters wouldn't
#' change the value of that parameter for a model.
#' Use [xgb.parameters<-()] to set or change model parameters.
#'
@@ -650,9 +628,17 @@ validate.features <- function(bst, newdata) {
#' Important: since this modifies the booster's C object, semantics for assignment here
#' will differ from R's, as any object reference to the same booster will be modified
#' too, while assignment of R attributes through `attributes(model)$<attr> <- <value>`
#' will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an
#' will follow the usual copy-on-write R semantics (see [xgb.copy.Booster()] for an
#' example of these behaviors).
#'
#' @param object Object of class `xgb.Booster`. **Will be modified in-place** when assigning to it.
#' @param name A non-empty character string specifying which attribute is to be accessed.
#' @param value For `xgb.attr<-`, a value of an attribute; for `xgb.attributes<-`,
#'   it is a list (or an object coercible to a list) with the names of attributes to set
#'   and the elements corresponding to attribute values.
#'   Non-character values are converted to character.
#'   When an attribute value is not a scalar, only the first index is used.
#'   Use `NULL` to remove an attribute.
#' @return
#' - `xgb.attr()` returns either a string value of an attribute
#'   or `NULL` if an attribute wasn't stored in a model.
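A compact sketch of the attribute round-trip described above, with an arbitrary attribute name (`bst` is a fitted booster as in the examples below):

xgb.attr(bst, "my_note") <- 42
xgb.attr(bst, "my_note")          # "42" - non-character values are stored as character
xgb.attributes(bst)               # list of all stored attributes
xgb.attr(bst, "my_note") <- NULL  # assigning NULL removes the attribute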
@@ -663,9 +649,8 @@ validate.features <- function(bst, newdata) {
#' data(agaricus.train, package = "xgboost")
#' train <- agaricus.train
#'
#' bst <- xgboost(
#'   data = train$data,
#'   label = train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(train$data, label = train$label),
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = 2,
@@ -750,15 +735,18 @@ xgb.attributes <- function(object) {
  return(object)
}

#' @title Accessors for model parameters as JSON string
#' @details Note that assignment is performed in-place on the booster C object, which unlike assignment
#' Accessors for model parameters as JSON string
#'
#' @details
#' Note that assignment is performed in-place on the booster C object, which unlike assignment
#' of R attributes, doesn't follow typical copy-on-write semantics for assignment - i.e. all references
#' to the same booster will also get updated.
#'
#' See \link{xgb.copy.Booster} for an example of this behavior.
#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it.
#' @param value An R list.
#' @return `xgb.config` will return the parameters as an R list.
#' See [xgb.copy.Booster()] for an example of this behavior.
#'
#' @param object Object of class `xgb.Booster`. **Will be modified in-place** when assigning to it.
#' @param value A list.
#' @return Parameters as a list.
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
@@ -767,9 +755,8 @@ xgb.attributes <- function(object) {
#' data.table::setDTthreads(nthread)
#' train <- agaricus.train
#'
#' bst <- xgboost(
#'   data = train$data,
#'   label = train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(train$data, label = train$label),
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = nthread,
@@ -798,28 +785,31 @@ xgb.config <- function(object) {
  return(object)
}

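The getter/setter pair round-trips the booster's internal JSON configuration. A short sketch (the exact structure of the list depends on the XGBoost version, so treat the fields as opaque):

config <- xgb.config(bst)   # parameters as an R list
str(config, max.level = 1)  # inspect the top-level sections
xgb.config(bst) <- config   # assignment writes it back, modifying 'bst' in-place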
#' @title Accessors for model parameters
#' @description Only the setter for xgboost parameters is currently implemented.
#' @details Just like \link{xgb.attr}, this function will make in-place modifications
#' Accessors for model parameters
#'
#' Only the setter for XGBoost parameters is currently implemented.
#'
#' @details
#' Just like [xgb.attr()], this function will make in-place modifications
#' on the booster object which do not follow typical R assignment semantics - that is,
#' all references to the same booster will also be updated, unlike assignment of R
#' attributes which follow copy-on-write semantics.
#'
#' See \link{xgb.copy.Booster} for an example of this behavior.
#' See [xgb.copy.Booster()] for an example of this behavior.
#'
#' Be aware that setting parameters of a fitted booster related to training continuation / updates
#' will reset its number of rounds indicator to zero.
#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place}.
#' @param object Object of class `xgb.Booster`. **Will be modified in-place**.
#' @param value A list (or an object coercible to a list) with the names of parameters to set
#'   and the elements corresponding to parameter values.
#' @return The same booster `object`, which gets modified in-place.
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
#' train <- agaricus.train
#'
#' bst <- xgboost(
#'   data = train$data,
#'   label = train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(train$data, label = train$label),
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = 2,
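The setter - the only accessor currently implemented - is a one-liner; note that, per the details above, it updates every reference to the same booster:

xgb.parameters(bst) <- list(eta = 0.5)  # modified in-place; no copy is made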
@@ -891,11 +881,12 @@ setinfo.xgb.Booster <- function(object, name, info) {
  return(TRUE)
}

#' @title Get number of boosting rounds in a fitted booster
#' Get number of boosting rounds in a fitted booster
#'
#' @param model,x A fitted `xgb.Booster` model.
#' @return The number of rounds saved in the model, as an integer.
#' @return The number of rounds saved in the model as an integer.
#' @details Note that setting booster parameters related to training
#' continuation / updates through \link{xgb.parameters<-} will reset the
#' continuation / updates through [xgb.parameters<-()] will reset the
#' number of rounds to zero.
#' @export
#' @rdname xgb.get.num.boosted.rounds
@@ -909,16 +900,19 @@ length.xgb.Booster <- function(x) {
  return(xgb.get.num.boosted.rounds(x))
}

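Since `length()` on a booster dispatches to this same accessor, the two calls should always agree; a quick sanity-check sketch with a fitted `model`:

stopifnot(length(model) == xgb.get.num.boosted.rounds(model))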
#' @title Slice Booster by Rounds
#' @description Creates a new booster including only a selected range of rounds / iterations
#' Slice Booster by Rounds
#'
#' Creates a new booster including only a selected range of rounds / iterations
#' from an existing booster, as given by the sequence `seq(start, end, step)`.
#' @details Note that any R attributes that the booster might have will not be copied into
#'
#' @details
#' Note that any R attributes that the booster might have will not be copied into
#' the resulting object.
#'
#' @param model,x A fitted `xgb.Booster` object, which is to be sliced by taking only a subset
#'   of its rounds / iterations.
#' @param start Start of the slice (base-1 and inclusive, like R's \link{seq}).
#' @param end End of the slice (base-1 and inclusive, like R's \link{seq}).
#'
#' @param start Start of the slice (base-1 and inclusive, like R's [seq()]).
#' @param end End of the slice (base-1 and inclusive, like R's [seq()]).
#'   Passing a value of zero here is equivalent to passing the full number of rounds in the
#'   booster object.
#' @param step Step size of the slice. Passing '1' will take every round in the sequence defined by
@@ -926,8 +920,10 @@ length.xgb.Booster <- function(x) {
#' @return A sliced booster object containing only the requested rounds.
#' @examples
#' data(mtcars)
#'
#' y <- mtcars$mpg
#' x <- as.matrix(mtcars[, -1])
#'
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
#' model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 5)
#' model_slice <- xgb.slice.Booster(model, 1, 3)
@@ -980,10 +976,12 @@ xgb.slice.Booster <- function(model, start, end = xgb.get.num.boosted.rounds(mod
  return(xgb.slice.Booster(x, i[1L], i[length(i)], steps[1L]))
}

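As the `[` method above simply forwards to `xgb.slice.Booster()`, the slicing in the example can equivalently be written with bracket syntax:

model_slice2 <- model[1:3]  # same rounds as xgb.slice.Booster(model, 1, 3)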
#' @title Get Features Names from Booster
#' @description Returns the feature / variable / column names from a fitted
#' booster object, which are set automatically during the call to \link{xgb.train}
#' from the DMatrix names, or which can be set manually through \link{setinfo}.
#' Get Features Names from Booster
#'
#' @description
#' Returns the feature / variable / column names from a fitted
#' booster object, which are set automatically during the call to [xgb.train()]
#' from the DMatrix names, or which can be set manually through [setinfo()].
#'
#' If the object doesn't have feature names, will return `NULL`.
#'
@@ -1034,23 +1032,25 @@ xgb.best_iteration <- function(bst) {
  return(out)
}

#' @title Extract coefficients from linear booster
#' @description Extracts the coefficients from a 'gblinear' booster object,
#' as produced by \code{xgb.train} when using parameter `booster="gblinear"`.
#' Extract coefficients from linear booster
#'
#' @description
#' Extracts the coefficients from a 'gblinear' booster object,
#' as produced by [xgb.train()] when using parameter `booster="gblinear"`.
#'
#' Note: this function will error out if passing a booster model
#' which is not of "gblinear" type.
#'
#' @param object A fitted booster of 'gblinear' type.
#' @param ... Not used.
#' @return The extracted coefficients:\itemize{
#' \item If there's only one coefficient per column in the data, will be returned as a
#' vector, potentially containing the feature names if available, with the intercept
#' as the first element.
#' \item If there's more than one coefficient per column in the data (e.g. when using
#' `objective="multi:softmax"`), will be returned as a matrix with dimensions equal
#' to `[num_features, num_cols]`, with the intercepts as first row. Note that the column
#' (classes in multi-class classification) dimension will not be named.
#' }
#' @return The extracted coefficients:
#' - If there is only one coefficient per column in the data, will be returned as a
#'   vector, potentially containing the feature names if available, with the intercept
#'   as the first element.
#' - If there is more than one coefficient per column in the data (e.g. when using
#'   `objective="multi:softmax"`), will be returned as a matrix with dimensions equal
#'   to `[num_features, num_cols]`, with the intercepts as first row. Note that the column
#'   (classes in multi-class classification) dimension will not be named.
#'
#' The intercept returned here will include the 'base_score' parameter (unlike the 'bias'
#' or the last coefficient in the model dump, which doesn't have 'base_score' added to it),
@@ -1059,12 +1059,15 @@ xgb.best_iteration <- function(bst) {
#'
#' Be aware that the coefficients are obtained by first converting them to strings and
#' back, so there will always be some very small loss of precision compared to the actual
#' coefficients as used by \link{predict.xgb.Booster}.
#' coefficients as used by [predict.xgb.Booster].
#' @examples
#' library(xgboost)
#'
#' data(mtcars)
#'
#' y <- mtcars[, 1]
#' x <- as.matrix(mtcars[, -1])
#'
#' dm <- xgb.DMatrix(data = x, label = y, nthread = 1)
#' params <- list(booster = "gblinear", nthread = 1)
#' model <- xgb.train(data = dm, params = params, nrounds = 2)
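Continuing the gblinear example above, the accessor places the intercept first; a short sketch (the intercept's display name may vary by version):

coef(model)  # vector: intercept first, then one coefficient per column of x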
@@ -1105,34 +1108,48 @@ coef.xgb.Booster <- function(object, ...) {
  if (n_cols == 1L) {
    out <- c(intercepts, coefs)
    if (add_names) {
      names(out) <- feature_names
      .Call(XGSetVectorNamesInplace_R, out, feature_names)
    }
  } else {
    coefs <- matrix(coefs, nrow = num_feature, byrow = TRUE)
    dim(intercepts) <- c(1L, n_cols)
    out <- rbind(intercepts, coefs)
    out_names <- vector(mode = "list", length = 2)
    if (add_names) {
      row.names(out) <- feature_names
      out_names[[1L]] <- feature_names
    }
    # TODO: if a class names attribute is added,
    # should use those names here.
    if (inherits(object, "xgboost")) {
      metadata <- attributes(object)$metadata
      if (NROW(metadata$y_levels)) {
        out_names[[2L]] <- metadata$y_levels
      } else if (NROW(metadata$y_names)) {
        out_names[[2L]] <- metadata$y_names
      }
    }
    .Call(XGSetArrayDimNamesInplace_R, out, out_names)
  }
  return(out)
}

#' @title Deep-copies a Booster Object
#' @description Creates a deep copy of an 'xgb.Booster' object, such that the
#' Deep-copies a Booster Object
#'
#' Creates a deep copy of an 'xgb.Booster' object, such that the
#' C object pointer contained will be a different object, and hence functions
#' like \link{xgb.attr} will not affect the object from which it was copied.
#' like [xgb.attr()] will not affect the object from which it was copied.
#'
#' @param model An 'xgb.Booster' object.
#' @return A deep copy of `model` - it will be identical in every way, but C-level
#'   functions called on that copy will not affect the `model` variable.
#' @examples
#' library(xgboost)
#'
#' data(mtcars)
#'
#' y <- mtcars$mpg
#' x <- mtcars[, -1]
#'
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
#'
#' model <- xgb.train(
#'   data = dm,
#'   params = list(nthread = 1),
@@ -1167,29 +1184,35 @@ xgb.copy.Booster <- function(model) {
  return(.Call(XGDuplicate_R, model))
}

#' @title Check if two boosters share the same C object
#' @description Checks whether two booster objects refer to the same underlying C object.
#' @details As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr'
#' Check if two boosters share the same C object
#'
#' Checks whether two booster objects refer to the same underlying C object.
#'
#' @details
#' As booster objects (as returned by e.g. [xgb.train()]) contain an R 'externalptr'
#' object, they don't follow typical copy-on-write semantics of other R objects - that is, if
#' one assigns a booster to a different variable and modifies that new variable through in-place
#' methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new
#' methods like [xgb.attr<-()], the modification will be applied to both the old and the new
#' variable, unlike typical R assignments which would only modify the latter.
#'
#' This function allows checking whether two booster objects share the same 'externalptr',
#' regardless of the R attributes that they might have.
#'
#' In order to duplicate a booster in such a way that the copy wouldn't share the same
#' 'externalptr', one can use function \link{xgb.copy.Booster}.
#' 'externalptr', one can use function [xgb.copy.Booster()].
#' @param obj1 Booster model to compare with `obj2`.
#' @param obj2 Booster model to compare with `obj1`.
#' @return Either `TRUE` or `FALSE` according to whether the two boosters share
#' the underlying C object.
#' @seealso \link{xgb.copy.Booster}
#' @return Either `TRUE` or `FALSE` according to whether the two boosters share the
#' underlying C object.
#' @seealso [xgb.copy.Booster()]
#' @examples
#' library(xgboost)
#'
#' data(mtcars)
#'
#' y <- mtcars$mpg
#' x <- as.matrix(mtcars[, -1])
#'
#' model <- xgb.train(
#'   params = list(nthread = 1),
#'   data = xgb.DMatrix(x, label = y, nthread = 1),
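A compact sketch of the reference semantics described above, reusing a fitted `model`:

model_ref <- model                      # plain assignment: same C object underneath
xgb.is.same.Booster(model, model_ref)   # TRUE
model_copy <- xgb.copy.Booster(model)
xgb.is.same.Booster(model, model_copy)  # FALSE - the copy has its own 'externalptr'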
@@ -1230,9 +1253,8 @@ xgb.is.same.Booster <- function(obj1, obj2) {
#' data(agaricus.train, package = "xgboost")
#' train <- agaricus.train
#'
#' bst <- xgboost(
#'   data = train$data,
#'   label = train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(train$data, label = train$label),
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = 2,
@@ -1243,10 +1265,10 @@ xgb.is.same.Booster <- function(obj1, obj2) {
#' attr(bst, "myattr") <- "memo"
#'
#' print(bst)
#'
#' @method print xgb.Booster
#' @export
print.xgb.Booster <- function(x, ...) {
  # this lets it error out when the object comes from an earlier R xgboost version
  # this lets it error out when the object comes from an earlier R XGBoost version
  handle <- xgb.get.handle(x)
  cat('##### xgb.Booster\n')


@@ -1,9 +1,9 @@
#' Construct xgb.DMatrix object
#'
#' Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
#' such as \link{xgb.train} or \link{predict.xgb.Booster}.
#' such as [xgb.train()] or [predict()].
#'
#' Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram
#' Function `xgb.QuantileDMatrix()` will construct a DMatrix with quantization for the histogram
#' method already applied to it, which can be used to reduce memory usage (compared to using
#' a regular DMatrix first and then creating a quantization out of it) when using the histogram
#' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the
@@ -24,20 +24,20 @@
#'
#' Other column types are not supported.
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are \bold{not} supported for
#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are **not** supported for
#' 'xgb.QuantileDMatrix'.
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
#' as a single row (only when making predictions from a fitted model).
#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
#' the file, with an optional format specifier.
#'
#' These are \bold{not} supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
#' \item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
#' These are **not** supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
#' \item XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()].
#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
#' `?format=libsvm` at the end of the file path. It will be the default format if not
#' otherwise specified.
#' \item CSV files (comma-separated values). This format can be specified by adding suffix
#' `?format=csv` at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
#' `?format=csv` at the end of the file path. It will **not** be auto-deduced from file extensions.
#' }
#'
#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
@@ -54,44 +54,41 @@
#' integers with numeration starting at zero.
#' @param weight Weight for each instance.
#'
#'   Note that, for ranking task, weights are per-group. In ranking task, one weight
#'   is assigned to each group (not each data point). This is because we
#'   only care about the relative ordering of data points within each group,
#'   so it doesn't make sense to assign weights to individual data points.
#' @param base_margin Base margin used for boosting from existing model.
#'
#'   In the case of multi-output models, one can also pass multi-dimensional base_margin.
#' @param missing A float value to represent missing values in data (not used when creating DMatrix
#'   from text files). It is useful to change when a zero, infinite, or some other
#'   extreme value represents missing values in data.
#' @param silent Whether to suppress printing an informational message after loading from a file.
#' @param feature_names Set names for features. Overrides column names in data frame and matrix.
#'
#'   Note: columns are not referenced by name when calling `predict`, so the column order there
#'   must be the same as in the DMatrix construction, regardless of the column names.
#' @param feature_types Set types for features.
#'
#'   If `data` is a `data.frame` and `feature_types` is not supplied,
#'   feature types will be deduced automatically from the column types.
#'
#'   Otherwise, one can pass a character vector with the same length as number of columns in `data`,
#'   with the following possible values:\itemize{
#'   \item "c", which represents categorical columns.
#'   \item "q", which represents numeric columns.
#'   \item "int", which represents integer columns.
#'   \item "i", which represents logical (boolean) columns.
#'   }
#'   Otherwise, one can pass a character vector with the same length as number of columns in `data`,
#'   with the following possible values:
#'   - "c", which represents categorical columns.
#'   - "q", which represents numeric columns.
#'   - "int", which represents integer columns.
#'   - "i", which represents logical (boolean) columns.
#'
#'   Note that, while categorical types are treated differently from the rest for model fitting
#'   purposes, the other types do not influence the generated model, but have effects in other
#'   functionalities such as feature importances.
#'
#'   \bold{Important}: categorical features, if specified manually through `feature_types`, must
#'   be encoded as integers with numeration starting at zero, and the same encoding needs to be
#'   applied when passing data to `predict`. Even if passing `factor` types, the encoding will
#'   not be saved, so make sure that `factor` columns passed to `predict` have the same `levels`.
#'   **Important**: Categorical features, if specified manually through `feature_types`, must
#'   be encoded as integers with numeration starting at zero, and the same encoding needs to be
#'   applied when passing data to [predict()]. Even if passing `factor` types, the encoding will
#'   not be saved, so make sure that `factor` columns passed to `predict` have the same `levels`.
#' @param nthread Number of threads used for creating DMatrix.
#' @param group Group size for all ranking group.
#' @param qid Query ID for data samples, used for ranking.
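A sketch of supplying `feature_types` manually, per the encoding rules above (column names and values are illustrative):

x <- cbind(
  cat_col = sample(0:2, 50, replace = TRUE),  # categorical, integer-encoded from zero
  num_col = rnorm(50)
)
dm <- xgb.DMatrix(x, feature_types = c("c", "q"), nthread = 1)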
@@ -99,23 +96,24 @@
#' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling.
#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
#'   whether to split by row or column. Allowed values are `"row"` and `"col"`.
#'
#'   In distributed mode, the file is split accordingly; otherwise this is only an indicator on
#'   how the file was split beforehand. Defaults to row.
#'
#'   This is not used when `data` is not a URI.
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
#'   subclass 'xgb.QuantileDMatrix'.
#'
#' @details
#' Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
#' Note that DMatrix objects are not serializable through R functions such as [saveRDS()] or [save()].
#' If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
#' chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
#' from the original source of data.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' ## Keep the number of threads to 1 for examples
#' nthread <- 1
#' data.table::setDTthreads(nthread)
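The serialization caveat in the details above can be demonstrated directly; a sketch reusing a DMatrix `dm` and the `x`/`y` data it was built from (the reloaded object wraps a dead pointer and must be rebuilt from the source data):

f <- tempfile(fileext = ".rds")
saveRDS(dm, f)     # saves the R wrapper, but not the C-level data behind it
dm2 <- readRDS(f)  # dm2 is NOT usable for training or prediction
# Reconstruct from the original data instead:
dm2 <- xgb.DMatrix(x, label = y, nthread = 1)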
@@ -318,13 +316,13 @@ xgb.DMatrix <- function(
}

#' @param ref The training dataset that provides quantile information, needed when creating
#' validation/test dataset with `xgb.QuantileDMatrix`. Supplying the training DMatrix
#' validation/test dataset with [xgb.QuantileDMatrix()]. Supplying the training DMatrix
#' as a reference means that the same quantisation applied to the training data is
#' applied to the validation/test data.
#' @param max_bin The number of histogram bins, should be consistent with the training parameter
#'   `max_bin`.
#'
#'   This is only supported when constructing a QuantileDMatrix.
#' @export
#' @rdname xgb.DMatrix
xgb.QuantileDMatrix <- function(
@@ -411,40 +409,42 @@ xgb.QuantileDMatrix <- function(
  return(dmat)
}

#' @title XGBoost Data Iterator
#' @description Interface to create a custom data iterator in order to construct a DMatrix
#' XGBoost Data Iterator
#'
#' @description
#' Interface to create a custom data iterator in order to construct a DMatrix
#' from external memory.
#'
#' This function is responsible for generating an R object structure containing callback
#' functions and an environment shared with them.
#'
#' The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix},
#' The output structure from this function is then meant to be passed to [xgb.ExtMemDMatrix()],
#' which will consume the data and create a DMatrix from it by executing the callback functions.
#'
#' For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
#' For more information, and for a usage example, see the documentation for [xgb.ExtMemDMatrix()].
#'
#' @param env An R environment to pass to the callback functions supplied here, which can be
#'   used to keep track of variables to determine how to handle the batches.
#'
#'   For example, one might want to keep track of an iteration number in this environment in order
#'   to know which part of the data to pass next.
#' @param f_next `function(env)` which is responsible for:\itemize{
#' \item Accessing or retrieving the next batch of data in the iterator.
#' \item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
#' \item Keeping track of where in the iterator batch it is or will go next, which can for example
#' be done by modifying variables in the `env` variable that is passed here.
#' \item Signaling whether there are more batches to be consumed or not, by returning `NULL`
#' when the stream of data ends (all batches in the iterator have been consumed), or the result from
#' calling \link{xgb.DataBatch} when there are more batches in the line to be consumed.
#' }
#' @param f_next `function(env)` which is responsible for:
#'   - Accessing or retrieving the next batch of data in the iterator.
#'   - Supplying this data by calling function [xgb.DataBatch()] on it and returning the result.
#'   - Keeping track of where in the iterator batch it is or will go next, which can for example
#'     be done by modifying variables in the `env` variable that is passed here.
#'   - Signaling whether there are more batches to be consumed or not, by returning `NULL`
#'     when the stream of data ends (all batches in the iterator have been consumed), or the result from
#'     calling [xgb.DataBatch()] when there are more batches in the line to be consumed.
#' @param f_reset `function(env)` which is responsible for resetting the data iterator
#'   (i.e. taking it back to the first batch, called before and after the sequence of batches
#'   has been consumed).
#'
#'   Note that, after resetting the iterator, the batches will be accessed again, so the same data
#'   (and in the same order) must be passed in subsequent iterations.
#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
#' be passed to \link{xgb.ExternalDMatrix}.
#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
#' be passed to [xgb.ExtMemDMatrix()].
#' @seealso [xgb.ExtMemDMatrix()], [xgb.DataBatch()].
#' @export
xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
  if (!is.function(f_next)) {
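A minimal two-batch iterator over in-memory matrices, as a sketch of the callback contract described above (`m1` and `m2` are placeholder matrices with identical columns; a real use case would read chunks from disk instead):

m1 <- matrix(rnorm(20), nrow = 5)
m2 <- matrix(rnorm(20), nrow = 5)
iter_env <- as.environment(list(i = 0L))

it <- xgb.DataIter(
  env = iter_env,
  f_next = function(env) {
    env$i <- env$i + 1L
    if (env$i > 2L) return(NULL)  # NULL signals the end of the stream
    xgb.DataBatch(data = if (env$i == 1L) m1 else m2)
  },
  f_reset = function(env) env$i <- 0L  # must replay the same batches in order
)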
@@ -508,38 +508,39 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
  return(out)
}

#' @title Structure for Data Batches
#' @description Helper function to supply data in batches of a data iterator when
#' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
#' or through \link{xgb.QuantileDMatrix.from_iterator}.
#' Structure for Data Batches
#'
#' This function is \bold{only} meant to be called inside of a callback function (which
#' is passed as argument to function \link{xgb.DataIter} to construct a data iterator)
#' @description
#' Helper function to supply data in batches of a data iterator when
#' constructing a DMatrix from external memory through [xgb.ExtMemDMatrix()]
#' or through [xgb.QuantileDMatrix.from_iterator()].
#'
#' This function is **only** meant to be called inside of a callback function (which
#' is passed as argument to function [xgb.DataIter()] to construct a data iterator)
#' when constructing a DMatrix through external memory - otherwise, one should call
#' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
#' [xgb.DMatrix()] or [xgb.QuantileDMatrix()].
#'
#' The object that results from calling this function directly is \bold{not} like
#' The object that results from calling this function directly is **not** like
#' an `xgb.DMatrix` - i.e. cannot be used to train a model, nor to get predictions - only
#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
#'
#' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
#' For more information and for example usage, see the documentation for [xgb.ExtMemDMatrix()].
#' @inheritParams xgb.DMatrix
#' @param data The data belonging to this batch.
#'
#' Note that not all of the input types supported by \link{xgb.DMatrix} are possible
#' to pass here. Supported types are:\itemize{
#' \item `matrix`, with types `numeric`, `integer`, and `logical`. Note that for types
#' `integer` and `logical`, missing values might not be automatically recognized
#' as such - see the documentation for parameter `missing` in \link{xgb.ExternalDMatrix}
#' for details on this.
#' \item `data.frame`, with the same types as supported by 'xgb.DMatrix' and same
#' conversions applied to it. See the documentation for parameter `data` in
#' \link{xgb.DMatrix} for details on it.
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
#' }
#' Note that not all of the input types supported by [xgb.DMatrix()] are possible
#' to pass here. Supported types are:
#' - `matrix`, with types `numeric`, `integer`, and `logical`. Note that for types
#'   `integer` and `logical`, missing values might not be automatically recognized
#'   as such - see the documentation for parameter `missing` in [xgb.ExtMemDMatrix()]
#'   for details on this.
#' - `data.frame`, with the same types as supported by 'xgb.DMatrix' and same
#'   conversions applied to it. See the documentation for parameter `data` in
#'   [xgb.DMatrix()] for details on it.
#' - CSR matrices, as class `dgRMatrix` from package "Matrix".
#' @return An object of class `xgb.DataBatch`, which is just a list containing the
#' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`.
#' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
#' data and parameters passed here. It does **not** inherit from `xgb.DMatrix`.
#' @seealso [xgb.DataIter()], [xgb.ExtMemDMatrix()].
#' @export
xgb.DataBatch <- function(
  data,
@@ -616,42 +617,43 @@ xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
  return(1L)
}

#' @title DMatrix from External Data
#' @description Create a special type of xgboost 'DMatrix' object from external data
#' supplied by an \link{xgb.DataIter} object, potentially passed in batches from a
#' DMatrix from External Data
#'
#' @description
#' Create a special type of XGBoost 'DMatrix' object from external data
#' supplied by an [xgb.DataIter()] object, potentially passed in batches from a
#' bigger set that might not fit entirely in memory.
#'
#' The data supplied by the iterator is accessed on-demand as needed, multiple times,
#' without being concatenated, but note that fields like 'label' \bold{will} be
#' without being concatenated, but note that fields like 'label' **will** be
#' concatenated from multiple calls to the data iterator.
#'
#' For more information, see the guide 'Using XGBoost External Memory Version':
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
#' @inheritParams xgb.DMatrix
#' @param data_iterator A data iterator structure as returned by \link{xgb.DataIter},
#'   which includes an environment shared between function calls, and functions to access
#'   the data in batches on-demand.
#' @param data_iterator A data iterator structure as returned by [xgb.DataIter()],
#'   which includes an environment shared between function calls, and functions to access
#'   the data in batches on-demand.
#' @param cache_prefix The path of the cache file; the caller must initialize all the directories in this path.
#' @param missing A float value to represent missing values in data.
#'
#'   Note that, while functions like \link{xgb.DMatrix} can take a generic `NA` and interpret it
#'   correctly for different types like `numeric` and `integer`, if an `NA` value is passed here,
#'   it will not be adapted for different input types.
#'   Note that, while functions like [xgb.DMatrix()] can take a generic `NA` and interpret it
#'   correctly for different types like `numeric` and `integer`, if an `NA` value is passed here,
#'   it will not be adapted for different input types.
#'
#'   For example, in R `integer` types, missing values are represented by integer number `-2147483648`
#'   (since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes `NA`,
#'   which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
#'   'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
#'   This should not pose any problem for `numeric` types, since they do have an inherent NaN value.
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
#'   held internally but accessed through the iterator when needed.
#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
#'   For example, in R `integer` types, missing values are represented by integer number `-2147483648`
#'   (since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes `NA`,
#'   which is interpreted as a floating-point NaN by [xgb.ExtMemDMatrix()] and by
#'   [xgb.QuantileDMatrix.from_iterator()], these integer missing values will not be treated as missing.
#'   This should not pose any problem for `numeric` types, since they do have an inherent NaN value.
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExtMemDMatrix', in which the data is not
#'   held internally but accessed through the iterator when needed.
#' @seealso [xgb.DataIter()], [xgb.DataBatch()], [xgb.QuantileDMatrix.from_iterator()]
#' @examples
#' library(xgboost)
#' data(mtcars)
#'
#' # this custom environment will be passed to the iterator
#' # functions at each call. It's up to the user to keep
#' # This custom environment will be passed to the iterator
#' # functions at each call. It is up to the user to keep
#' # track of the iteration number in this environment.
#' iterator_env <- as.environment(
#'   list(
@@ -704,7 +706,7 @@ xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
#' cache_prefix <- tempdir()
#'
#' # DMatrix will be constructed from the iterator's batches
#' dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
#' dm <- xgb.ExtMemDMatrix(data_iterator, cache_prefix, nthread = 1)
#'
#' # After construction, can be used as a regular DMatrix
#' params <- list(nthread = 1, objective = "reg:squarederror")
@@ -715,7 +717,7 @@ xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
#' pred_dm <- predict(model, dm)
#' pred_mat <- predict(model, as.matrix(mtcars[, -1]))
#' @export
xgb.ExternalDMatrix <- function(
xgb.ExtMemDMatrix <- function(
  data_iterator,
  cache_prefix = tempdir(),
  missing = NA,
@@ -751,32 +753,34 @@ xgb.ExternalDMatrix <- function(
  )

  attributes(dmat) <- list(
    class = c("xgb.DMatrix", "xgb.ExternalDMatrix"),
    class = c("xgb.DMatrix", "xgb.ExtMemDMatrix"),
    fields = attributes(proxy_handle)$fields
  )
  return(dmat)
}


#' @title QuantileDMatrix from External Data
#' @description Create an `xgb.QuantileDMatrix` object (exact same class as would be returned by
#' calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from
#' external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from
#' a bigger set that might not fit entirely in memory, the same way as \link{xgb.ExternalDMatrix}.
#' QuantileDMatrix from External Data
#'
#' @description
#' Create an `xgb.QuantileDMatrix` object (exact same class as would be returned by
#' calling function [xgb.QuantileDMatrix()], with the same advantages and limitations) from
#' external data supplied by [xgb.DataIter()], potentially passed in batches from
#' a bigger set that might not fit entirely in memory, the same way as [xgb.ExtMemDMatrix()].
#'
#' Note that, while external data will only be loaded through the iterator (thus the full data
#' might not be held entirely in-memory), the quantized representation of the data will get
#' created in-memory, being concatenated from multiple calls to the data iterator. The quantized
#' version is typically lighter than the original data, so there might be cases in which this
#' representation could potentially fit in memory even if the full data doesn't.
#' representation could potentially fit in memory even if the full data does not.
#'
#' For more information, see the guide 'Using XGBoost External Memory Version':
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
#' @inheritParams xgb.ExternalDMatrix
#' @inheritParams xgb.ExtMemDMatrix
#' @inheritParams xgb.QuantileDMatrix
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
#' \link{xgb.QuantileDMatrix}
#' @seealso [xgb.DataIter()], [xgb.DataBatch()], [xgb.ExtMemDMatrix()],
#' [xgb.QuantileDMatrix()]
#' @export
xgb.QuantileDMatrix.from_iterator <- function( # nolint
  data_iterator,
@@ -823,18 +827,18 @@ xgb.QuantileDMatrix.from_iterator <- function( # nolint
  return(dmat)
}

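Reusing the iterator `it` from the earlier sketch, the quantized external-memory construction is then a drop-in alternative to `xgb.ExtMemDMatrix()` (argument names assume the renamed interface in this diff):

qdm <- xgb.QuantileDMatrix.from_iterator(it, max_bin = 256, nthread = 1)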
#' @title Check whether DMatrix object has a field
#' @description Checks whether an xgb.DMatrix object has a given field assigned to
#' Check whether DMatrix object has a field
#'
#' Checks whether an xgb.DMatrix object has a given field assigned to
#' it, such as weights, labels, etc.
#' @param object The DMatrix object to check for the given \code{info} field.
#' @param info The field to check for presence or absence in \code{object}.
#' @seealso \link{xgb.DMatrix}, \link{getinfo.xgb.DMatrix}, \link{setinfo.xgb.DMatrix}
#' @param object The DMatrix object to check for the given `info` field.
#' @param info The field to check for presence or absence in `object`.
#' @seealso [xgb.DMatrix()], [getinfo.xgb.DMatrix()], [setinfo.xgb.DMatrix()]
#' @examples
#' library(xgboost)
#' x <- matrix(1:10, nrow = 5)
#' dm <- xgb.DMatrix(x, nthread = 1)
#'
#' # 'dm' so far doesn't have any fields set
#' # 'dm' so far does not have any fields set
#' xgb.DMatrix.hasinfo(dm, "label")
#'
#' # Fields can be added after construction
@@ -853,49 +857,21 @@ xgb.DMatrix.hasinfo <- function(object, info) {
}


# get dmatrix from data, label
# internal helper method
xgb.get.DMatrix <- function(data, label, missing, weight, nthread) {
  if (inherits(data, "dgCMatrix") || is.matrix(data)) {
    if (is.null(label)) {
      stop("label must be provided when data is a matrix")
    }
    dtrain <- xgb.DMatrix(data, label = label, missing = missing, nthread = nthread)
    if (!is.null(weight)) {
      setinfo(dtrain, "weight", weight)
    }
  } else {
    if (!is.null(label)) {
      warning("xgboost: label will be ignored.")
    }
    if (is.character(data)) {
      data <- path.expand(data)
      dtrain <- xgb.DMatrix(data[1])
    } else if (inherits(data, "xgb.DMatrix")) {
      dtrain <- data
    } else if (inherits(data, "data.frame")) {
      stop("xgboost doesn't support data.frame as input. Convert it to matrix first.")
    } else {
      stop("xgboost: invalid input data")
    }
  }
  return(dtrain)
}


#' Dimensions of xgb.DMatrix
#'
#' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}.
#' @param x Object of class \code{xgb.DMatrix}
#' Returns a vector of numbers of rows and of columns in an `xgb.DMatrix`.
#'
#' @param x Object of class `xgb.DMatrix`
#'
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{xgb.DMatrix} object.
#' Note: since [nrow()] and [ncol()] internally use [dim()], they can also
#' be directly used with an `xgb.DMatrix` object.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label, nthread = 2)
#' dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
#'
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
@@ -907,27 +883,28 @@ dim.xgb.DMatrix <- function(x) {
}


#' Handling of column names of \code{xgb.DMatrix}
#' Handling of column names of `xgb.DMatrix`
#'
#' Only column names are supported for \code{xgb.DMatrix}, thus setting of
#' row names would have no effect and returned row names would be NULL.
#' Only column names are supported for `xgb.DMatrix`, thus setting of
#' row names would have no effect and returned row names would be `NULL`.
#'
#' @param x object of class \code{xgb.DMatrix}
#' @param value a list of two elements: the first one is ignored
#' and the second one is column names
#' @param x Object of class `xgb.DMatrix`.
#' @param value A list of two elements: the first one is ignored
#'   and the second one is column names.
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#' Generic [dimnames()] methods are used by [colnames()].
#' Since row names are irrelevant, it is recommended to use [colnames()] directly.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label, nthread = 2)
#' dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
#' dimnames(dtrain)
#' colnames(dtrain)
#' colnames(dtrain) <- make.names(1:ncol(train$data))
#' print(dtrain, verbose=TRUE)
#' print(dtrain, verbose = TRUE)
#'
#' @rdname dimnames.xgb.DMatrix
#' @export
@@ -956,47 +933,45 @@ dimnames.xgb.DMatrix <- function(x) {
}


#' @title Get or set information of xgb.DMatrix and xgb.Booster objects
#' @param object Object of class \code{xgb.DMatrix} of `xgb.Booster`.
#' @param name the name of the information field to get (see details)
#' @return For `getinfo`, will return the requested field. For `setinfo`, will always return value `TRUE`
#' if it succeeds.
#' @details
#' The \code{name} field can be one of the following for `xgb.DMatrix`:
#' Get or set information of xgb.DMatrix and xgb.Booster objects
#'
#' \itemize{
#' \item \code{label}
#' \item \code{weight}
#' \item \code{base_margin}
#' \item \code{label_lower_bound}
#' \item \code{label_upper_bound}
#' \item \code{group}
#' \item \code{feature_type}
#' \item \code{feature_name}
#' \item \code{nrow}
#' }
#' See the documentation for \link{xgb.DMatrix} for more information about these fields.
#' @param object Object of class `xgb.DMatrix` or `xgb.Booster`.
#' @param name The name of the information field to get (see details).
#' @return For `getinfo()`, will return the requested field. For `setinfo()`,
#'   will always return value `TRUE` if it succeeds.
#' @details
#' The `name` field can be one of the following for `xgb.DMatrix`:
#' - label
#' - weight
#' - base_margin
#' - label_lower_bound
#' - label_upper_bound
#' - group
#' - feature_type
#' - feature_name
#' - nrow
#'
#' See the documentation for [xgb.DMatrix()] for more information about these fields.
#'
#' For `xgb.Booster`, can be one of the following:
#' \itemize{
#' \item \code{feature_type}
#' \item \code{feature_name}
#' }
#' - `feature_type`
#' - `feature_name`
#'
#' Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group'
#' Note that, while 'qid' cannot be retrieved, it is possible to get the equivalent 'group'
#' for a DMatrix that had 'qid' assigned.
#'
#' \bold{Important}: when calling `setinfo`, the objects are modified in-place. See
#' \link{xgb.copy.Booster} for an idea of this in-place assignment works.
#' **Important**: when calling [setinfo()], the objects are modified in-place. See
#' [xgb.copy.Booster()] for an idea of how this in-place assignment works.
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels <- getinfo(dtrain, "label")
#' setinfo(dtrain, "label", 1 - labels)
#'
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all(labels2 == 1-labels))
#' labels2 <- getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' @rdname getinfo
#' @export
getinfo <- function(object, name) UseMethod("getinfo")
@@ -1041,28 +1016,29 @@ getinfo.xgb.DMatrix <- function(object, name) {
}

#' @rdname getinfo
#' @param info the specific field of information to set
#' @param info The specific field of information to set.
#'
#' @details
#' See the documentation for \link{xgb.DMatrix} for possible fields that can be set
#' See the documentation for [xgb.DMatrix()] for possible fields that can be set
#' (which correspond to arguments in that function).
#'
#' Note that the following fields are allowed in the construction of an \code{xgb.DMatrix}
#' but \bold{aren't} allowed here:\itemize{
#' \item data
#' \item missing
#' \item silent
#' \item nthread
#' }
#' Note that the following fields are allowed in the construction of an `xgb.DMatrix`
#' but **are not** allowed here:
#' - data
#' - missing
#' - silent
#' - nthread
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all.equal(labels2, 1-labels))
#' labels <- getinfo(dtrain, "label")
#' setinfo(dtrain, "label", 1 - labels)
#'
#' labels2 <- getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' @export
setinfo <- function(object, name, info) UseMethod("setinfo")

@@ -1147,9 +1123,11 @@ setinfo.xgb.DMatrix <- function(object, name, info) {
  stop("setinfo: unknown info name ", name)
}

#' @title Get Quantile Cuts from DMatrix
#' @description Get the quantile cuts (a.k.a. borders) from an `xgb.DMatrix`
#' that has been quantized for the histogram method (`tree_method="hist"`).
#' Get Quantile Cuts from DMatrix
#'
#' @description
#' Get the quantile cuts (a.k.a. borders) from an `xgb.DMatrix`
#' that has been quantized for the histogram method (`tree_method = "hist"`).
#'
#' These cuts are used in order to assign observations to bins - i.e. these are ordered
#' boundaries which are used to determine assignment condition `border_low < x < border_high`.
@@ -1160,19 +1138,18 @@ setinfo.xgb.DMatrix <- function(object, name, info) {
#' which will be output in sorted order from lowest to highest.
#'
#' Different columns can have different numbers of bins according to their range.
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @param output Output format for the quantile cuts. Possible options are:\itemize{
#' \item `"list"` will return the output as a list with one entry per column, where
#' each column will have a numeric vector with the cuts. The list will be named if
#' `dmat` has column names assigned to it.
#' \item `"arrays"` will return a list with entries `indptr` (base-0 indexing) and
#' `data`. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
#' `indptr[i]+1` to `indptr[i+1]`.
#' }
#' @param dmat An `xgb.DMatrix` object, as returned by [xgb.DMatrix()].
#' @param output Output format for the quantile cuts. Possible options are:
#'   - `"list"` will return the output as a list with one entry per column, where
#'     each column will have a numeric vector with the cuts. The list will be named if
#'     `dmat` has column names assigned to it.
#'   - `"arrays"` will return a list with entries `indptr` (base-0 indexing) and
#'     `data`. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
#'     `indptr[i]+1` to `indptr[i+1]`.
#' @return The quantile cuts, in the format specified by parameter `output`.
#' @examples
#' library(xgboost)
#' data(mtcars)
#'
#' y <- mtcars$mpg
#' x <- as.matrix(mtcars[, -1])
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
@@ -1180,11 +1157,7 @@ setinfo.xgb.DMatrix <- function(object, name, info) {
#' # DMatrix is not quantized right away, but will be once a hist model is generated
#' model <- xgb.train(
#'   data = dm,
#'   params = list(
#'     tree_method = "hist",
#'     max_bin = 8,
#'     nthread = 1
#'   ),
#'   params = list(tree_method = "hist", max_bin = 8, nthread = 1),
#'   nrounds = 3
#' )
#'
@@ -1219,17 +1192,19 @@ xgb.get.DMatrix.qcut <- function(dmat, output = c("list", "arrays")) { # nolint
  }
}

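# A small sketch of the relationship described above between the "arrays" and
# "list" output formats, assuming 'dm' has already been quantized by training
# the hist-method model from the example:
res <- xgb.get.DMatrix.qcut(dm, output = "arrays")
# Cuts for column 'i' (base-1 in R) live in data[(indptr[i] + 1):indptr[i + 1]]
cuts_col1 <- res$data[(res$indptr[1] + 1):res$indptr[2]]
as_list <- xgb.get.DMatrix.qcut(dm, output = "list")
stopifnot(isTRUE(all.equal(unname(cuts_col1), unname(as_list[[1]]))))
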
#' @title Get Number of Non-Missing Entries in DMatrix
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @return The number of non-missing entries in the DMatrix
#' Get Number of Non-Missing Entries in DMatrix
#'
#' @param dmat An `xgb.DMatrix` object, as returned by [xgb.DMatrix()].
#' @return The number of non-missing entries in the DMatrix.
#' @export
xgb.get.DMatrix.num.non.missing <- function(dmat) { # nolint
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  return(.Call(XGDMatrixNumNonMissing_R, dmat))
}

#' @title Get DMatrix Data
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' Get DMatrix Data
#'
#' @param dmat An `xgb.DMatrix` object, as returned by [xgb.DMatrix()].
#' @return The data held in the DMatrix, as a sparse CSR matrix (class `dgRMatrix`
#' from package `Matrix`). If it had feature names, these will be added as column names
#' in the output.
@@ -1253,27 +1228,27 @@ xgb.get.DMatrix.data <- function(dmat) {
  return(out)
}

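# A quick sketch tying together the two accessors documented above, on a tiny
# matrix with one missing entry (the values are purely illustrative):
x_small <- matrix(c(1, NA, 3, 4), nrow = 2)
dm_small <- xgb.DMatrix(x_small, nthread = 1)
xgb.get.DMatrix.num.non.missing(dm_small)  # 3: the NA entry is not stored
xgb.get.DMatrix.data(dm_small)             # the same values, as a dgRMatrix
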
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#' Slice DMatrix
#'
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#' Get a new DMatrix containing the specified rows of original xgb.DMatrix object.
#'
#' @param object Object of class "xgb.DMatrix".
#' @param object Object of class `xgb.DMatrix`.
#' @param idxset An integer vector of indices of rows needed (base-1 indexing).
#' @param allow_groups Whether to allow slicing an `xgb.DMatrix` with `group` (or
#' equivalently `qid`) field. Note that in such case, the result will not have
#' the groups anymore - they need to be set manually through `setinfo`.
#' @param colset currently not used (columns subsetting is not available)
#'   equivalently `qid`) field. Note that in such case, the result will not have
#'   the groups anymore - they need to be set manually through [setinfo()].
#' @param colset Currently not used (columns subsetting is not available).
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' dsub <- xgb.slice.DMatrix(dtrain, 1:42)
#' labels1 <- getinfo(dsub, 'label')
#' labels1 <- getinfo(dsub, "label")
#'
#' dsub <- dtrain[1:42, ]
#' labels2 <- getinfo(dsub, 'label')
#' labels2 <- getinfo(dsub, "label")
#' all.equal(labels1, labels2)
#'
#' @rdname xgb.slice.DMatrix
@@ -1322,16 +1297,17 @@ xgb.slice.DMatrix <- function(object, idxset, allow_groups = FALSE) {
#' Print information about xgb.DMatrix.
#' Currently it displays dimensions and presence of info-fields and colnames.
#'
#' @param x an xgb.DMatrix object
#' @param verbose whether to print colnames (when present)
#' @param ... not currently used
#' @param x An xgb.DMatrix object.
#' @param verbose Whether to print colnames (when present).
#' @param ... Not currently used.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#' data(agaricus.train, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#' dtrain
#' print(dtrain, verbose=TRUE)
#'
#' print(dtrain, verbose = TRUE)
#'
#' @method print xgb.DMatrix
#' @export
@@ -1342,8 +1318,8 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
  }
  class_print <- if (inherits(x, "xgb.QuantileDMatrix")) {
    "xgb.QuantileDMatrix"
  } else if (inherits(x, "xgb.ExternalDMatrix")) {
    "xgb.ExternalDMatrix"
  } else if (inherits(x, "xgb.ExtMemDMatrix")) {
    "xgb.ExtMemDMatrix"
  } else if (inherits(x, "xgb.ProxyDMatrix")) {
    "xgb.ProxyDMatrix"
  } else {

@@ -2,12 +2,13 @@
#'
#' Save xgb.DMatrix object to binary file
#'
#' @param dmatrix the \code{xgb.DMatrix} object
#' @param dmatrix the `xgb.DMatrix` object
#' @param fname the name of the file to write.
#'
#' @examples
#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#' fname <- file.path(tempdir(), "xgb.DMatrix.data")
#' xgb.DMatrix.save(dtrain, fname)

@@ -1,24 +1,26 @@
#' Set and get global configuration
#'
#' Global configuration consists of a collection of parameters that can be applied in the global
#' scope. See \url{https://xgboost.readthedocs.io/en/stable/parameter.html} for the full list of
#' parameters supported in the global configuration. Use \code{xgb.set.config} to update the
#' values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current
#' parameters supported in the global configuration. Use `xgb.set.config()` to update the
#' values of one or more global-scope parameters. Use `xgb.get.config()` to fetch the current
#' values of all global-scope parameters (listed in
#' \url{https://xgboost.readthedocs.io/en/stable/parameter.html}).
#'
#' @details
#' Note that serialization-related functions might use a globally-configured number of threads,
#' which is managed by the system's OpenMP (OMP) configuration instead. Typically, XGBoost methods
#' accept an `nthreads` parameter, but some methods like `readRDS` might get executed before such
#' accept an `nthreads` parameter, but some methods like [readRDS()] might get executed before such
#' parameter can be supplied.
#'
#' The number of OMP threads can in turn be configured for example through an environment variable
#' `OMP_NUM_THREADS` (needs to be set before R is started), or through `RhpcBLASctl::omp_set_num_threads`.
#' @rdname xgbConfig
#' @title Set and get global configuration
#' @name xgb.set.config, xgb.get.config
#' @export xgb.set.config xgb.get.config
#' @param ... List of parameters to be set, as keyword arguments
#' @return
#' \code{xgb.set.config} returns \code{TRUE} to signal success. \code{xgb.get.config} returns
#' `xgb.set.config()` returns `TRUE` to signal success. `xgb.get.config()` returns
#' a list containing all global-scope parameters and their values.
#'
#' @examples

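# A minimal sketch of the setter/getter pair described above; 'verbosity' is
# one of the global-scope parameters listed in the linked documentation:
xgb.set.config(verbosity = 0)
stopifnot(xgb.get.config()$verbosity == 0)
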
@@ -1,20 +1,15 @@
#' Create new features from a previously learned model
#'
#' May improve the learning by adding new features to the training data based on the decision trees from a previously learned model.
#'
#' @param model decision tree boosting model learned on the original data
#' @param data original data (usually provided as a \code{dgCMatrix} matrix)
#' @param ... currently not used
#'
#' @return \code{dgCMatrix} matrix including both the original data and the new features.
#' May improve the learning by adding new features to the training data based on the
#' decision trees from a previously learned model.
#'
#' @details
#' This function is inspired by paragraph 3.1 of the paper:
#'
#' \strong{Practical Lessons from Predicting Clicks on Ads at Facebook}
#' **Practical Lessons from Predicting Clicks on Ads at Facebook**
#'
#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
#' Joaquin Quinonero Candela)}
#' *(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
#' Joaquin Quinonero Candela)*
#'
#' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
#'
@@ -33,11 +28,11 @@
#' where the first subtree has 3 leafs and the second 2 leafs. If an
#' instance ends up in leaf 2 in the first subtree and leaf 1 in
#' second subtree, the overall input to the linear classifier will
#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
#' be the binary vector `[0, 1, 0, 1, 0]`, where the first 3 entries
#' correspond to the leaves of the first subtree and last 2 to
#' those of the second subtree.
#'
#' [...]
#' ...
#'
#' We can understand boosted decision tree
#' based transformation as a supervised feature encoding that
@@ -45,16 +40,23 @@
#' vector. A traversal from root node to a leaf node represents
#' a rule on certain features."
#'
#' @param model Decision tree boosting model learned on the original data.
#' @param data Original data (usually provided as a `dgCMatrix` matrix).
#' @param ... Currently not used.
#'
#' @return A `dgCMatrix` matrix including both the original data and the new features.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#' data(agaricus.test, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' param <- list(max_depth=2, eta=1, objective='binary:logistic')
#' param <- list(max_depth = 2, eta = 1, objective = "binary:logistic")
#' nrounds <- 4
#'
#' bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
#' bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
#'
#' # Model accuracy without new features
#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) /

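# A short sketch of the leaf-encoding idea quoted above: the new columns are
# one-hot indicators of which leaf each instance reaches in each tree, so the
# output gains one column per leaf ('bst' and the data come from the example):
new_train <- xgb.create.features(model = bst, agaricus.train$data)
dim(agaricus.train$data)  # original feature count
dim(new_train)            # original features plus one indicator per leaf
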
@@ -2,141 +2,141 @@
#'
#' The cross validation function of xgboost.
#'
#' @param params the list of parameters. The complete list of parameters is
#' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
#' is a shorter summary:
#' \itemize{
#' \item \code{objective} objective function, common ones are
#' \itemize{
#' \item \code{reg:squarederror} Regression with squared loss.
#' \item \code{binary:logistic} logistic regression for classification.
#' \item See \code{\link[=xgb.train]{xgb.train}()} for complete list of objectives.
#' }
#' \item \code{eta} step size of each boosting step
#' \item \code{max_depth} maximum depth of the tree
#' \item \code{nthread} number of thread used in training, if not set, all threads are used
#' }
#' @param params The list of parameters. The complete list of parameters is available in the
#'   [online documentation](http://xgboost.readthedocs.io/en/latest/parameter.html).
#'   Below is a shorter summary:
#'   - `objective`: Objective function, common ones are
#'     - `reg:squarederror`: Regression with squared loss.
#'     - `binary:logistic`: Logistic regression for classification.
#'
#' See \code{\link{xgb.train}} for further details.
#' See also demo/ for walkthrough example in R.
#'     See [xgb.train()] for complete list of objectives.
#'   - `eta`: Step size of each boosting step
#'   - `max_depth`: Maximum depth of the tree
#'   - `nthread`: Number of threads used in training. If not set, all threads are used
#'
#' Note that, while `params` accepts a `seed` entry and will use such parameter for model training if
#' supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG
#' system - thus, for reproducible results, one needs to call the `set.seed` function beforehand.
#'   See [xgb.train()] for further details.
#'   See also the demo for a walkthrough example in R.
#'
#'   Note that, while `params` accepts a `seed` entry and will use such parameter for model training if
#'   supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG
#'   system - thus, for reproducible results, one needs to call the [set.seed()] function beforehand.
#' @param data An `xgb.DMatrix` object, with corresponding fields like `label` or bounds as required
#' for model training by the objective.
#'   for model training by the objective.
#'
#' Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix`
#' or `xgb.ExternalDMatrix` are not supported here.
#' @param nrounds the max number of iterations
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#'   Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix`
#'   or `xgb.ExtMemDMatrix` are not supported here.
#' @param nrounds The max number of iterations.
#' @param nfold The original dataset is randomly partitioned into `nfold` equal size subsamples.
#' @param prediction A logical value indicating whether to return the test fold predictions
#' from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback.
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in cross validation,
#'   from each CV model. This parameter engages the [xgb.cb.cv.predict()] callback.
#' @param showsd Logical value indicating whether to show the standard deviation of cross validation.
#' @param metrics List of evaluation metrics to be used in cross validation,
#' when it is not specified, the evaluation metric is chosen according to objective function.
#' Possible options are:
#' \itemize{
#' \item \code{error} binary classification error rate
#' \item \code{rmse} Rooted mean square error
#' \item \code{logloss} negative log-likelihood function
#' \item \code{mae} Mean absolute error
#' \item \code{mape} Mean absolute percentage error
#' \item \code{auc} Area under curve
#' \item \code{aucpr} Area under PR curve
#' \item \code{merror} Exact matching error, used to evaluate multi-class classification
#' }
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain.
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain.
#' @param stratified A \code{boolean} indicating whether sampling of folds should be stratified
#' by the values of outcome labels. For real-valued labels in regression objectives,
#' stratification will be done by discretizing the labels into up to 5 buckets beforehand.
#'   - `error`: Binary classification error rate
#'   - `rmse`: Root mean square error
#'   - `logloss`: Negative log-likelihood function
#'   - `mae`: Mean absolute error
#'   - `mape`: Mean absolute percentage error
#'   - `auc`: Area under curve
#'   - `aucpr`: Area under PR curve
#'   - `merror`: Exact matching error used to evaluate multi-class classification
#' @param obj Customized objective function. Returns gradient and second order
#'   gradient with given prediction and dtrain.
#' @param feval Customized evaluation function. Returns
#'   `list(metric='metric-name', value='metric-value')` with given prediction and dtrain.
#' @param stratified Logical flag indicating whether sampling of folds should be stratified
#'   by the values of outcome labels. For real-valued labels in regression objectives,
#'   stratification will be done by discretizing the labels into up to 5 buckets beforehand.
#'
#' If passing "auto", will be set to `TRUE` if the objective in `params` is a classification
#' objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to
#' `FALSE` otherwise.
#'   If passing "auto", will be set to `TRUE` if the objective in `params` is a classification
#'   objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to
#'   `FALSE` otherwise.
#'
#' This parameter is ignored when `data` has a `group` field - in such case, the splitting
#' will be based on whole groups (note that this might make the folds have different sizes).
#'   This parameter is ignored when `data` has a `group` field - in such case, the splitting
#'   will be based on whole groups (note that this might make the folds have different sizes).
#'
#' Value `TRUE` here is \bold{not} supported for custom objectives.
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#' (each element must be a vector of test fold's indices). When folds are supplied,
#' the \code{nfold} and \code{stratified} parameters are ignored.
#'   Value `TRUE` here is **not** supported for custom objectives.
#' @param folds List with pre-defined CV folds (each element must be a vector of test fold's indices).
#'   When folds are supplied, the `nfold` and `stratified` parameters are ignored.
#'
#' If `data` has a `group` field and the objective requires this field, each fold (list element)
#' must additionally have two attributes (retrievable through \link{attributes}) named `group_test`
#' and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to
#' the resulting DMatrices.
#' @param train_folds \code{list} list specifying which indicies to use for training. If \code{NULL}
#' (the default) all indices not specified in \code{folds} will be used for training.
#'   If `data` has a `group` field and the objective requires this field, each fold (list element)
#'   must additionally have two attributes (retrievable through `attributes`) named `group_test`
#'   and `group_train`, which should hold the `group` to assign through [setinfo.xgb.DMatrix()] to
#'   the resulting DMatrices.
#' @param train_folds List specifying which indices to use for training. If `NULL`
#'   (the default) all indices not specified in `folds` will be used for training.
#'
#' This is not supported when `data` has `group` field.
#' @param verbose \code{boolean}, print the statistics during the process
#' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
#' Default is 1 which means all messages are printed. This parameter is passed to the
#' \code{\link{xgb.cb.print.evaluation}} callback.
#' @param early_stopping_rounds If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' doesn't improve for \code{k} rounds.
#' Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.
#' @param maximize If \code{feval} and \code{early_stopping_rounds} are set,
#' then this parameter must be set as well.
#' When it is \code{TRUE}, it means the larger the evaluation score the better.
#' This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.
#' @param callbacks a list of callback functions to perform various task during boosting.
#' See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the
#' parameters' values. User can provide either existing or their own callback methods in order
#' to customize the training process.
#' @param ... other parameters to pass to \code{params}.
#'   This is not supported when `data` has `group` field.
#' @param verbose Logical flag. Should statistics be printed during the process?
#' @param print_every_n Print each nth iteration evaluation messages when `verbose > 0`.
#'   Default is 1 which means all messages are printed. This parameter is passed to the
#'   [xgb.cb.print.evaluation()] callback.
#' @param early_stopping_rounds If `NULL`, the early stopping function is not triggered.
#'   If set to an integer `k`, training with a validation set will stop if the performance
#'   doesn't improve for `k` rounds.
#'   Setting this parameter engages the [xgb.cb.early.stop()] callback.
#' @param maximize If `feval` and `early_stopping_rounds` are set,
#'   then this parameter must be set as well.
#'   When it is `TRUE`, it means the larger the evaluation score the better.
#'   This parameter is passed to the [xgb.cb.early.stop()] callback.
#' @param callbacks A list of callback functions to perform various tasks during boosting.
#'   See [xgb.Callback()]. Some of the callbacks are automatically created depending on the
#'   parameters' values. User can provide either existing or their own callback methods in order
#'   to customize the training process.
#' @param ... Other parameters to pass to `params`.
#'
#' @details
#' The original sample is randomly partitioned into \code{nfold} equal size subsamples.
#' The original sample is randomly partitioned into `nfold` equal size subsamples.
#'
#' Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model,
#' and the remaining \code{nfold - 1} subsamples are used as training data.
#' Of the `nfold` subsamples, a single subsample is retained as the validation data for testing the model,
#' and the remaining `nfold - 1` subsamples are used as training data.
#'
#' The cross-validation process is then repeated \code{nrounds} times, with each of the
#' \code{nfold} subsamples used exactly once as the validation data.
#' The cross-validation process is then repeated `nrounds` times, with each of the
#' `nfold` subsamples used exactly once as the validation data.
#'
#' All observations are used for both training and validation.
#'
#' Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29}
#'
#' @return
#' An object of class \code{xgb.cv.synchronous} with the following elements:
#' \itemize{
#' \item \code{call} a function call.
#' \item \code{params} parameters that were passed to the xgboost library. Note that it does not
#' capture parameters changed by the \code{\link{xgb.cb.reset.parameters}} callback.
#' \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
#' first column corresponding to iteration number and the rest corresponding to the
#' CV-based evaluation means and standard deviations for the training and test CV-sets.
#' It is created by the \code{\link{xgb.cb.evaluation.log}} callback.
#' \item \code{niter} number of boosting iterations.
#' \item \code{nfeatures} number of features in training data.
#' \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds}
#' parameter or randomly generated.
#' \item \code{best_iteration} iteration number with the best evaluation metric value
#' (only available with early stopping).
#' }
#' An object of class 'xgb.cv.synchronous' with the following elements:
#' - `call`: Function call.
#' - `params`: Parameters that were passed to the xgboost library. Note that it does not
#'   capture parameters changed by the [xgb.cb.reset.parameters()] callback.
#' - `evaluation_log`: Evaluation history stored as a `data.table` with the
#'   first column corresponding to iteration number and the rest corresponding to the
#'   CV-based evaluation means and standard deviations for the training and test CV-sets.
#'   It is created by the [xgb.cb.evaluation.log()] callback.
#' - `niter`: Number of boosting iterations.
#' - `nfeatures`: Number of features in training data.
#' - `folds`: The list of CV folds' indices - either those passed through the `folds`
#'   parameter or randomly generated.
#' - `best_iteration`: Iteration number with the best evaluation metric value
#'   (only available with early stopping).
#'
#' Plus other potential elements that are the result of callbacks, such as a list `cv_predict` with
#' a sub-element `pred` when passing `prediction = TRUE`, which is added by the \link{xgb.cb.cv.predict}
#' callback (note that one can also pass it manually under `callbacks` with different settings,
#' such as saving also the models created during cross validation); or a list `early_stop` which
#' will contain elements such as `best_iteration` when using the early stopping callback (\link{xgb.cb.early.stop}).
#' Plus other potential elements that are the result of callbacks, such as a list `cv_predict` with
#' a sub-element `pred` when passing `prediction = TRUE`, which is added by the [xgb.cb.cv.predict()]
#' callback (note that one can also pass it manually under `callbacks` with different settings,
#' such as saving also the models created during cross validation); or a list `early_stop` which
#' will contain elements such as `best_iteration` when using the early stopping callback ([xgb.cb.early.stop()]).
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#' cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
#'              max_depth = 3, eta = 1, objective = "binary:logistic")
#'
#' cv <- xgb.cv(
#'   data = dtrain,
#'   nrounds = 3,
#'   nthread = 2,
#'   nfold = 5,
#'   metrics = list("rmse", "auc"),
#'   max_depth = 3,
#'   eta = 1,
#'   objective = "binary:logistic"
#' )
#' print(cv)
#' print(cv, verbose=TRUE)
#' print(cv, verbose = TRUE)
#'
#' @export
xgb.cv <- function(params = list(), data, nrounds, nfold,
@@ -325,23 +325,31 @@ xgb.cv <- function(params = list(), data, nrounds, nfold,

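# A minimal sketch of the `folds` parameter described above: five manually
# constructed test-index folds, after which `nfold` and `stratified` are
# ignored. The split itself uses R's own RNG, per the note under `params`.
data(agaricus.train, package = "xgboost")
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

set.seed(1)
idx <- sample(nrow(agaricus.train$data))
folds <- split(idx, rep_len(1:5, length(idx)))

cv <- xgb.cv(
  data = dtrain,
  folds = folds,
  nrounds = 3,
  nthread = 2,
  max_depth = 3,
  eta = 1,
  objective = "binary:logistic"
)
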
#' Print xgb.cv result
#'
#' Prints formatted results of \code{xgb.cv}.
#' Prints formatted results of [xgb.cv()].
#'
#' @param x an \code{xgb.cv.synchronous} object
#' @param verbose whether to print detailed data
#' @param ... passed to \code{data.table.print}
#' @param x An `xgb.cv.synchronous` object.
#' @param verbose Whether to print detailed data.
#' @param ... Passed to `data.table.print()`.
#'
#' @details
#' When not verbose, it would only print the evaluation results,
#' including the best iteration (when available).
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' train <- agaricus.train
#' cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2,
#'              eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' cv <- xgb.cv(
#'   data = xgb.DMatrix(train$data, label = train$label),
#'   nfold = 5,
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = 2,
#'   nrounds = 2,
#'   objective = "binary:logistic"
#' )
#' print(cv)
#' print(cv, verbose=TRUE)
#' print(cv, verbose = TRUE)
#'
#' @rdname print.xgb.cv
#' @method print xgb.cv.synchronous

@@ -1,36 +1,44 @@
#' Dump an xgboost model in text format.
#' Dump an XGBoost model in text format.
#'
#' Dump an xgboost model in text format.
#' Dump an XGBoost model in text format.
#'
#' @param model the model object.
#' @param fname the name of the text file where to save the model text dump.
#' If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.
#' @param fmap feature map file representing feature types.
#' See demo/ for walkthrough example in R, and
#' \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
#' for example Format.
#' @param with_stats whether to dump some additional statistics about the splits.
#' When this option is on, the model dump contains two additional values:
#' gain is the approximate loss function gain we get in each split;
#' cover is the sum of second order gradient in each node.
#' @param dump_format either 'text', 'json', or 'dot' (graphviz) format could be specified.
#' @param model The model object.
#' @param fname The name of the text file where to save the model text dump.
#'   If not provided or set to `NULL`, the model is returned as a character vector.
#' @param fmap Feature map file representing feature types. See demo/ for a walkthrough
#'   example in R, and \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
#'   to see an example of the value.
#' @param with_stats Whether to dump some additional statistics about the splits.
#'   When this option is on, the model dump contains two additional values:
#'   gain is the approximate loss function gain we get in each split;
#'   cover is the sum of second order gradient in each node.
#' @param dump_format Either 'text', 'json', or 'dot' (graphviz) format could be specified.
#'
#' Format 'dot' for a single tree can be passed directly to packages that consume this format
#' for graph visualization, such as function [DiagrammeR::grViz()]
#' @param ... currently not used
#'   Format 'dot' for a single tree can be passed directly to packages that consume this format
#'   for graph visualization, such as function `DiagrammeR::grViz()`
#' @param ... Currently not used.
#'
#' @return
#' If fname is not provided or set to \code{NULL} the function will return the model
#' as a \code{character} vector. Otherwise it will return \code{TRUE}.
#' If fname is not provided or set to `NULL` the function will return the model
#' as a character vector. Otherwise it will return `TRUE`.
#'
#' @examples
#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#' data(agaricus.test, package = "xgboost")
#'
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#'
#' bst <- xgb.train(
#'   data = xgb.DMatrix(train$data, label = train$label),
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = 2,
#'   nrounds = 2,
#'   objective = "binary:logistic"
#' )
#'
#' # save the model in file 'xgb.model.dump'
#' dump_path <- file.path(tempdir(), 'model.dump')
#' xgb.dump(bst, dump_path, with_stats = TRUE)
@@ -39,7 +47,7 @@
#' print(xgb.dump(bst, with_stats = TRUE))
#'
#' # print in JSON format:
#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
#' cat(xgb.dump(bst, with_stats = TRUE, dump_format = "json"))
#'
#' # plot first tree leveraging the 'dot' format
#' if (requireNamespace('DiagrammeR', quietly = TRUE)) {

@@ -1,6 +1,5 @@
# ggplot backend for the xgboost plotting facilities


#' @rdname xgb.plot.importance
#' @export
xgb.ggplot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL,
@@ -103,6 +102,27 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med
#' @export
xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL,
                                    trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) {
  if (inherits(data, "xgb.DMatrix")) {
    stop(
      "'xgb.ggplot.shap.summary' is not compatible with 'xgb.DMatrix' objects. Try passing a matrix or data.frame."
    )
  }
  cols_categ <- NULL
  if (!is.null(model)) {
    ftypes <- getinfo(model, "feature_type")
    if (NROW(ftypes)) {
      if (length(ftypes) != ncol(data)) {
        stop(sprintf("'data' has incorrect number of columns (expected: %d, got: %d).", length(ftypes), ncol(data)))
      }
      cols_categ <- colnames(data)[ftypes == "c"]
    }
  } else if (inherits(data, "data.frame")) {
    cols_categ <- names(data)[sapply(data, function(x) is.factor(x) || is.character(x))]
  }
  if (NROW(cols_categ)) {
    warning("Categorical features are ignored in 'xgb.ggplot.shap.summary'.")
  }

  data_list <- xgb.shap.data(
    data = data,
    shap_contrib = shap_contrib,
@@ -115,6 +135,10 @@ xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL,
    subsample = subsample,
    max_observations = 10000 # 10,000 samples per feature.
  )
  if (NROW(cols_categ)) {
    data_list <- lapply(data_list, function(x) x[, !(colnames(x) %in% cols_categ), drop = FALSE])
  }

  p_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE)
  # Reverse factor levels so that the first level is at the top of the plot
  p_data[, "feature" := factor(feature, rev(levels(feature)))]
@@ -135,8 +159,8 @@ xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL,
#' @param data_list The result of `xgb.shap.data()`.
#' @param normalize Whether to standardize feature values to mean 0 and
#'   standard deviation 1. This is useful for comparing multiple features on the same
#'   plot. Default is \code{FALSE}.
#'
#'   plot. Default is `FALSE`. Note that it cannot be used when the data contains
#'   categorical features.
#' @return A `data.table` containing the observation ID, the feature name, the
#'   feature value (normalized if specified), and the SHAP contribution value.
#' @noRd
@@ -167,7 +191,6 @@ prepare.ggplot.shap.data <- function(data_list, normalize = FALSE) {
#' Useful to compare multiple features on the same plot.
#'
#' @param x Numeric vector.
#'
#' @return Numeric vector with mean 0 and standard deviation 1.
#' @noRd
#' @keywords internal

@@ -2,27 +2,25 @@
#'
#' Creates a `data.table` of feature importances.
#'
#' @param feature_names Character vector used to overwrite the feature names
#'   of the model. The default is `NULL` (use original feature names).
#' @param model Object of class `xgb.Booster`.
#' @param trees An integer vector of tree indices that should be included
#'   into the importance calculation (only for the "gbtree" booster).
#'   The default (`NULL`) parses all trees.
#'   It could be useful, e.g., in multiclass classification to get feature importances
#'   for each class separately. *Important*: the tree index in XGBoost models
#'   is zero-based (e.g., use `trees = 0:4` for the first five trees).
#' @param data Deprecated.
#' @param label Deprecated.
#' @param target Deprecated.
#'
#' @details
#'
#' This function works for both linear and tree models.
#'
#' For linear models, the importance is the absolute magnitude of linear coefficients.
#' To obtain a meaningful ranking by importance for linear models, the features need to
#' be on the same scale (which is also recommended when using L1 or L2 regularization).
#'
#' @param feature_names Character vector used to overwrite the feature names
#'   of the model. The default is `NULL` (use original feature names).
#' @param model Object of class `xgb.Booster`.
#' @param trees An integer vector of tree indices that should be included
#'   into the importance calculation (only for the "gbtree" booster).
#'   The default (`NULL`) parses all trees.
#'   It could be useful, e.g., in multiclass classification to get feature importances
#'   for each class separately. *Important*: the tree index in XGBoost models
#'   is zero-based (e.g., use `trees = 0:4` for the first five trees).
#' @param data Deprecated.
#' @param label Deprecated.
#' @param target Deprecated.
#' @return A `data.table` with the following columns:
#'
#' For a tree model:
@@ -46,9 +44,8 @@
#' # binomial classification using "gbtree":
#' data(agaricus.train, package = "xgboost")
#'
#' bst <- xgboost(
#'   data = agaricus.train$data,
#'   label = agaricus.train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = 2,
@@ -59,9 +56,8 @@
#' xgb.importance(model = bst)
#'
#' # binomial classification using "gblinear":
#' bst <- xgboost(
#'   data = agaricus.train$data,
#'   label = agaricus.train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
#'   booster = "gblinear",
#'   eta = 0.3,
#'   nthread = 1,
@@ -73,9 +69,11 @@
#' # multiclass classification using "gbtree":
#' nclass <- 3
#' nrounds <- 10
#' mbst <- xgboost(
#'   data = as.matrix(iris[, -5]),
#'   label = as.numeric(iris$Species) - 1,
#' mbst <- xgb.train(
#'   data = xgb.DMatrix(
#'     as.matrix(iris[, -5]),
#'     label = as.numeric(iris$Species) - 1
#'   ),
#'   max_depth = 3,
#'   eta = 0.2,
#'   nthread = 2,
@@ -99,9 +97,11 @@
#' )
#'
#' # multiclass classification using "gblinear":
#' mbst <- xgboost(
#'   data = scale(as.matrix(iris[, -5])),
#'   label = as.numeric(iris$Species) - 1,
#' mbst <- xgb.train(
#'   data = xgb.DMatrix(
#'     scale(as.matrix(iris[, -5])),
#'     label = as.numeric(iris$Species) - 1
#'   ),
#'   booster = "gblinear",
#'   eta = 0.2,
#'   nthread = 1,

@@ -1,28 +1,27 @@
#' Load xgboost model from binary file
#' Load XGBoost model from binary file
#'
#' Load xgboost model from the binary model file.
#' Load XGBoost model from binary model file.
#'
#' @param modelfile the name of the binary input file.
#' @param modelfile The name of the binary input file.
#'
#' @details
#' The input file is expected to contain a model saved in an xgboost model format
#' using either \code{\link{xgb.save}} or \code{\link{xgb.cb.save.model}} in R, or using some
#' appropriate methods from other xgboost interfaces. E.g., a model trained in Python and
#' saved from there in xgboost format, could be loaded from R.
#' The input file is expected to contain a model saved in an XGBoost model format
#' using either [xgb.save()] in R, or using some
#' appropriate methods from other XGBoost interfaces. E.g., a model trained in Python and
#' saved from there in XGBoost format, could be loaded from R.
#'
#' Note: a model saved as an R-object, has to be loaded using corresponding R-methods,
#' not \code{xgb.load}.
#' Note: a model saved as an R object has to be loaded using corresponding R-methods,
#' not by [xgb.load()].
#'
#' @return
#' An object of \code{xgb.Booster} class.
#' An object of `xgb.Booster` class.
#'
#' @seealso
#' \code{\link{xgb.save}}
#' @seealso [xgb.save()]
#'
#' @examples
#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#' data(agaricus.test, package = "xgboost")
#'
#' ## Keep the number of threads to 1 for examples
#' nthread <- 1
@@ -30,6 +29,7 @@
#'
#' train <- agaricus.train
#' test <- agaricus.test
#'
#' bst <- xgb.train(
#'   data = xgb.DMatrix(train$data, label = train$label),
#'   max_depth = 2,

@@ -1,8 +1,8 @@
#' Load serialised xgboost model from R's raw vector
#' Load serialised XGBoost model from R's raw vector
#'
#' User can generate raw memory buffer by calling xgb.save.raw
#' User can generate raw memory buffer by calling [xgb.save.raw()].
#'
#' @param buffer the buffer returned by xgb.save.raw
#' @param buffer The buffer returned by [xgb.save.raw()].
#' @export
xgb.load.raw <- function(buffer) {
  cachelist <- list()

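# A round-trip sketch for the raw-vector serialisation described above,
# assuming 'bst' is any trained xgb.Booster:
raw_bytes <- xgb.save.raw(bst)
bst2 <- xgb.load.raw(raw_bytes)
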
@ -2,18 +2,17 @@
|
||||
#'
|
||||
#' Parse a boosted tree model text dump into a `data.table` structure.
|
||||
#'
|
||||
#' @param model Object of class `xgb.Booster`. If it contains feature names (they can be set through
|
||||
#' \link{setinfo}), they will be used in the output from this function.
|
||||
#' @param model Object of class `xgb.Booster`. If it contains feature names (they can
|
||||
#' be set through [setinfo()]), they will be used in the output from this function.
|
||||
#' @param text Character vector previously generated by the function [xgb.dump()]
|
||||
#' (called with parameter `with_stats = TRUE`). `text` takes precedence over `model`.
|
||||
#' @param trees An integer vector of tree indices that should be used.
|
||||
#' The default (`NULL`) uses all trees.
|
||||
#' Useful, e.g., in multiclass classification to get only
|
||||
#' the trees of one class. *Important*: the tree index in XGBoost models
|
||||
#' is zero-based (e.g., use `trees = 0:4` for the first five trees).
|
||||
#' (called with parameter `with_stats = TRUE`). `text` takes precedence over `model`.
|
||||
#' @param trees An integer vector of tree indices that should be used. The default
|
||||
#' (`NULL`) uses all trees. Useful, e.g., in multiclass classification to get only
|
||||
#' the trees of one class. *Important*: the tree index in XGBoost models
|
||||
#' is zero-based (e.g., use `trees = 0:4` for the first five trees).
|
||||
#' @param use_int_id A logical flag indicating whether nodes in columns "Yes", "No", and
|
||||
#' "Missing" should be represented as integers (when `TRUE`) or as "Tree-Node"
|
||||
#' character strings (when `FALSE`, default).
|
||||
#' "Missing" should be represented as integers (when `TRUE`) or as "Tree-Node"
|
||||
#' character strings (when `FALSE`, default).
|
||||
#' @param ... Currently not used.
|
||||
#'
|
||||
#' @return
|
||||
@ -43,9 +42,8 @@
#' nthread <- 1
#' data.table::setDTthreads(nthread)
#'
#' bst <- xgboost(
#'   data = agaricus.train$data,
#'   label = agaricus.train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = nthread,

@ -91,7 +89,7 @@ xgb.model.dt.tree <- function(model = NULL, text = NULL,
    from_text <- FALSE
  }

  if (length(text) < 2 || !any(grepl('leaf=(\\d+)', text))) {
  if (length(text) < 2 || !any(grepl('leaf=(-?\\d+)', text))) {
    stop("Non-tree model detected! This function can only be used with tree models.")
  }
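A minimal sketch of the `text` input path that this check guards (assuming a fitted tree-model booster `bst`):

dump_text <- xgb.dump(bst, with_stats = TRUE)  # text dump containing 'leaf=' entries
dt <- xgb.model.dt.tree(text = dump_text)      # parsed into a data.table, one row per node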
@ -110,7 +108,7 @@ xgb.model.dt.tree <- function(model = NULL, text = NULL,
  } else {
    trees <- trees[trees >= 0 & trees <= max(td$Tree)]
  }
  td <- td[Tree %in% trees & !grepl('^booster', t)]
  td <- td[Tree %in% trees & !is.na(t) & !startsWith(t, 'booster')]

  td[, Node := as.integer(sub("^([0-9]+):.*", "\\1", t))]
  if (!use_int_id) td[, ID := add.tree.id(Node, Tree)]

@ -196,7 +194,7 @@ xgb.model.dt.tree <- function(model = NULL, text = NULL,
  td[order(Tree, Node)]
}

# Avoid error messages during CRAN check.
# Avoid notes during CRAN check.
# The reason is that these variables are never declared.
# They are mainly column names inferred by data.table...
globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf", ".SD", ".SDcols"))
@ -4,7 +4,8 @@
#' - `xgb.plot.deepness()` uses base R graphics, while
#' - `xgb.ggplot.deepness()` uses "ggplot2".
#'
#' @param model Either an `xgb.Booster` model, or the "data.table" returned by [xgb.model.dt.tree()].
#' @param model Either an `xgb.Booster` model, or the "data.table" returned
#' by [xgb.model.dt.tree()].
#' @param which Which distribution to plot (see details).
#' @param plot Should the plot be shown? Default is `TRUE`.
#' @param ... Other parameters passed to [graphics::barplot()] or [graphics::plot()].
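Both frontends take the fitted model directly; a usage sketch (assuming `bst` is the booster fitted in the example below):

xgb.plot.deepness(bst)     # base R graphics
xgb.ggplot.deepness(bst)   # the same distributions drawn with ggplot2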
@ -48,9 +49,8 @@
#' data.table::setDTthreads(nthread)
#'
#' ## Change max_depth to a higher number to get a more significant result
#' bst <- xgboost(
#'   data = agaricus.train$data,
#'   label = agaricus.train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
#'   max_depth = 6,
#'   nthread = nthread,
#'   nrounds = 50,
@ -4,25 +4,9 @@
#' - `xgb.plot.importance()` uses base R graphics, while
#' - `xgb.ggplot.importance()` uses "ggplot".
#'
#' @param importance_matrix A `data.table` as returned by [xgb.importance()].
#' @param top_n Maximal number of top features to include into the plot.
#' @param measure The name of importance measure to plot.
#' When `NULL`, 'Gain' would be used for trees and 'Weight' would be used for gblinear.
#' @param rel_to_first Whether importance values should be represented as relative to
#' the highest ranked feature, see Details.
#' @param left_margin Adjust the left margin size to fit feature names.
#' When `NULL`, the existing `par("mar")` is used.
#' @param cex Passed as `cex.names` parameter to [graphics::barplot()].
#' @param plot Should the barplot be shown? Default is `TRUE`.
#' @param n_clusters A numeric vector containing the min and the max range
#' of the possible number of clusters of bars.
#' @param ... Other parameters passed to [graphics::barplot()]
#' (except `horiz`, `border`, `cex.names`, `names.arg`, and `las`).
#' Only used in `xgb.plot.importance()`.
#'
#' @details
#' The graph represents each feature as a horizontal bar of length proportional to the importance of a feature.
#' Features are sorted by decreasing importance.
#' The graph represents each feature as a horizontal bar of length proportional to the
#' importance of a feature. Features are sorted by decreasing importance.
#' It works for both "gblinear" and "gbtree" models.
#'
#' When `rel_to_first = FALSE`, the values would be plotted as in `importance_matrix`.
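For instance, a sketch reusing the `bst` booster fitted in the examples:

imp <- xgb.importance(model = bst)
xgb.plot.importance(imp, top_n = 10, measure = "Gain", rel_to_first = TRUE)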
@ -35,6 +19,21 @@
#' The "ggplot" backend performs 1-D clustering of the importance values,
#' with bar colors corresponding to different clusters having similar importance values.
#'
#' @param importance_matrix A `data.table` as returned by [xgb.importance()].
#' @param top_n Maximal number of top features to include into the plot.
#' @param measure The name of importance measure to plot.
#' When `NULL`, 'Gain' would be used for trees and 'Weight' would be used for gblinear.
#' @param rel_to_first Whether importance values should be represented as relative to
#' the highest ranked feature, see Details.
#' @param left_margin Adjust the left margin size to fit feature names.
#' When `NULL`, the existing `par("mar")` is used.
#' @param cex Passed as `cex.names` parameter to [graphics::barplot()].
#' @param plot Should the barplot be shown? Default is `TRUE`.
#' @param n_clusters A numeric vector containing the min and the max range
#' of the possible number of clusters of bars.
#' @param ... Other parameters passed to [graphics::barplot()]
#' (except `horiz`, `border`, `cex.names`, `names.arg`, and `las`).
#' Only used in `xgb.plot.importance()`.
#' @return
#' The return value depends on the function:
#' - `xgb.plot.importance()`: Invisibly, a "data.table" with `n_top` features sorted

@ -51,9 +50,8 @@
#' nthread <- 2
#' data.table::setDTthreads(nthread)
#'
#' bst <- xgboost(
#'   data = agaricus.train$data,
#'   label = agaricus.train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
#'   max_depth = 3,
#'   eta = 1,
#'   nthread = nthread,
@ -2,12 +2,7 @@
#'
#' Visualization of the ensemble of trees as a single collective unit.
#'
#' @inheritParams xgb.plot.tree
#' @param features_keep Number of features to keep in each position of the multi trees,
#' by default 5.
#'
#' @details
#'
#' This function tries to capture the complexity of a gradient boosted tree model
#' in a cohesive way by compressing an ensemble of trees into a single tree-graph representation.
#' The goal is to improve the interpretability of a model generally seen as a black box.

@ -25,6 +20,9 @@
#' This function is inspired by this blog post:
#' <https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/>
#'
#' @inheritParams xgb.plot.tree
#' @param features_keep Number of features to keep in each position of the multi trees,
#' by default 5.
#' @inherit xgb.plot.tree return
#'
#' @examples

@ -35,9 +33,8 @@
#' nthread <- 2
#' data.table::setDTthreads(nthread)
#'
#' bst <- xgboost(
#'   data = agaricus.train$data,
#'   label = agaricus.train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
#'   max_depth = 15,
#'   eta = 1,
#'   nthread = nthread,
@ -112,11 +109,10 @@ xgb.plot.multi.trees <- function(model, features_keep = 5, plot_width = NULL, pl

  edges.dt <- data.table::rbindlist(
    l = list(
      tree.matrix[Feature != "Leaf", .(abs.node.position, Yes)],
      tree.matrix[Feature != "Leaf", .(abs.node.position, No)]
      tree.matrix[Feature != "Leaf", .(From = abs.node.position, To = Yes)],
      tree.matrix[Feature != "Leaf", .(From = abs.node.position, To = No)]
    )
  )
  data.table::setnames(edges.dt, c("From", "To"))
  edges.dt <- edges.dt[, .N, .(From, To)]
  edges.dt[, N := NULL]
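The count-then-drop idiom above deduplicates edges; the same idiom in isolation (a standalone data.table sketch, not part of the diff):

library(data.table)
edges <- data.table(From = c("a", "a", "b"), To = c("b", "b", "c"))
edges <- edges[, .N, .(From, To)]  # one row per distinct (From, To) pair
edges[, N := NULL]                 # the count column is only a byproduct of the dedup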
@ -2,44 +2,43 @@
#'
#' Visualizes SHAP values against feature values to gain an impression of feature effects.
#'
#' @param data The data to explain as a `matrix` or `dgCMatrix`.
#' @param data The data to explain as a `matrix`, `dgCMatrix`, or `data.frame`.
#' @param shap_contrib Matrix of SHAP contributions of `data`.
#' The default (`NULL`) computes it from `model` and `data`.
#' @param features Vector of column indices or feature names to plot.
#' When `NULL` (default), the `top_n` most important features are selected
#' by [xgb.importance()].
#' The default (`NULL`) computes it from `model` and `data`.
#' @param features Vector of column indices or feature names to plot. When `NULL`
#' (default), the `top_n` most important features are selected by [xgb.importance()].
#' @param top_n How many of the most important features (<= 100) should be selected?
#' By default 1 for SHAP dependence and 10 for SHAP summary).
#' Only used when `features = NULL`.
#' By default 1 for SHAP dependence and 10 for SHAP summary.
#' Only used when `features = NULL`.
#' @param model An `xgb.Booster` model. Only required when `shap_contrib = NULL` or
#' `features = NULL`.
#' `features = NULL`.
#' @param trees Passed to [xgb.importance()] when `features = NULL`.
#' @param target_class Only relevant for multiclass models. The default (`NULL`)
#' averages the SHAP values over all classes. Pass a (0-based) class index
#' to show only SHAP values of that class.
#' averages the SHAP values over all classes. Pass a (0-based) class index
#' to show only SHAP values of that class.
#' @param approxcontrib Passed to `predict()` when `shap_contrib = NULL`.
#' @param subsample Fraction of data points randomly picked for plotting.
#' The default (`NULL`) will use up to 100k data points.
#' The default (`NULL`) will use up to 100k data points.
#' @param n_col Number of columns in a grid of plots.
#' @param col Color of the scatterplot markers.
#' @param pch Scatterplot marker.
#' @param discrete_n_uniq Maximal number of unique feature values to consider the
#' feature as discrete.
#' feature as discrete.
#' @param discrete_jitter Jitter amount added to the values of discrete features.
#' @param ylab The y-axis label in 1D plots.
#' @param plot_NA Should contributions of cases with missing values be plotted?
#' Default is `TRUE`.
#' Default is `TRUE`.
#' @param col_NA Color of marker for missing value contributions.
#' @param pch_NA Marker type for `NA` values.
#' @param pos_NA Relative position of the x-location where `NA` values are shown:
#' `min(x) + (max(x) - min(x)) * pos_NA`.
#' `min(x) + (max(x) - min(x)) * pos_NA`.
#' @param plot_loess Should loess-smoothed curves be plotted? (Default is `TRUE`).
#' The smoothing is only done for features with more than 5 distinct values.
#' The smoothing is only done for features with more than 5 distinct values.
#' @param col_loess Color of loess curves.
#' @param span_loess The `span` parameter of [stats::loess()].
#' @param which Whether to do univariate or bivariate plotting. Currently, only "1d" is implemented.
#' @param plot Should the plot be drawn? (Default is `TRUE`).
#' If `FALSE`, only a list of matrices is returned.
#' If `FALSE`, only a list of matrices is returned.
#' @param ... Other parameters passed to [graphics::plot()].
#'
#' @details
@ -82,9 +81,8 @@
#' data.table::setDTthreads(nthread)
#' nrounds <- 20
#'
#' bst <- xgboost(
#'   agaricus.train$data,
#'   agaricus.train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
#'   nrounds = nrounds,
#'   eta = 0.1,
#'   max_depth = 3,

@ -108,9 +106,8 @@
#' set.seed(123)
#' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
#'
#' mbst <- xgboost(
#'   data = x,
#'   label = as.numeric(iris$Species) - 1,
#' mbst <- xgb.train(
#'   data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1),
#'   nrounds = nrounds,
#'   max_depth = 2,
#'   eta = 0.3,

@ -122,6 +119,7 @@
#' )
#' trees0 <- seq(from = 0, by = nclass, length.out = nrounds)
#' col <- rgb(0, 0, 1, 0.5)
#'
#' xgb.plot.shap(
#'   x,
#'   model = mbst,
@ -287,8 +285,11 @@ xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, to
xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL,
                          trees = NULL, target_class = NULL, approxcontrib = FALSE,
                          subsample = NULL, max_observations = 100000) {
  if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
    stop("data: must be either matrix or dgCMatrix")
  if (!inherits(data, c("matrix", "dsparseMatrix", "data.frame")))
    stop("data: must be matrix, sparse matrix, or data.frame.")
  if (inherits(data, "data.frame") && length(class(data)) > 1L) {
    data <- as.data.frame(data)
  }

  if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster")))
    stop("when shap_contrib is not provided, one must provide an xgb.Booster model")

@ -296,8 +297,10 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
  if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster")))
    stop("when features are not provided, one must provide an xgb.Booster model to rank the features")

  last_dim <- function(v) dim(v)[length(dim(v))]

  if (!is.null(shap_contrib) &&
      (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
      (!is.array(shap_contrib) || nrow(shap_contrib) != nrow(data) || last_dim(shap_contrib) != ncol(data) + 1))
    stop("shap_contrib is not compatible with the provided data")

  if (is.character(features) && is.null(colnames(data)))

@ -311,7 +314,14 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
    stop("if model has no feature_names, columns in `data` must match features in model")

  if (!is.null(subsample)) {
    idx <- sample(x = seq_len(nrow(data)), size = as.integer(subsample * nrow(data)), replace = FALSE)
    if (subsample <= 0 || subsample >= 1) {
      stop("'subsample' must be a number between zero and one (non-inclusive).")
    }
    sample_size <- as.integer(subsample * nrow(data))
    if (sample_size < 2) {
      stop("Sampling fraction involves less than 2 rows.")
    }
    idx <- sample(x = seq_len(nrow(data)), size = sample_size, replace = FALSE)
  } else {
    idx <- seq_len(min(nrow(data), max_observations))
  }
@ -320,19 +330,39 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
    colnames(data) <- paste0("X", seq_len(ncol(data)))
  }

  if (!is.null(shap_contrib)) {
    if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
      shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] else Reduce("+", lapply(shap_contrib, abs))
    }
    shap_contrib <- shap_contrib[idx, ]
    if (is.null(colnames(shap_contrib))) {
      colnames(shap_contrib) <- paste0("X", seq_len(ncol(data)))
    }
  } else {
    shap_contrib <- predict(model, newdata = data, predcontrib = TRUE, approxcontrib = approxcontrib)
    if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
      shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] else Reduce("+", lapply(shap_contrib, abs))
  reshape_3d_shap_contrib <- function(shap_contrib, target_class) {
    # multiclass: either choose a class or merge
    if (is.list(shap_contrib)) {
      if (!is.null(target_class)) {
        shap_contrib <- shap_contrib[[target_class + 1]]
      } else {
        shap_contrib <- Reduce("+", lapply(shap_contrib, abs))
      }
    } else if (length(dim(shap_contrib)) > 2) {
      if (!is.null(target_class)) {
        orig_shape <- dim(shap_contrib)
        shap_contrib <- shap_contrib[, target_class + 1, , drop = TRUE]
        if (!is.matrix(shap_contrib)) {
          shap_contrib <- matrix(shap_contrib, orig_shape[c(1L, 3L)])
        }
      } else {
        shap_contrib <- apply(abs(shap_contrib), c(1L, 3L), sum)
      }
    }
    return(shap_contrib)
  }
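The 3-D branch merges per-class contributions by summing absolute values over the class dimension; the same `apply()` idiom in isolation (a standalone sketch):

arr <- array(rnorm(5 * 3 * 4), dim = c(5, 3, 4))  # rows x classes x features
merged <- apply(abs(arr), c(1L, 3L), sum)         # 5 x 4 matrix: |SHAP| summed over classes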

  if (is.null(shap_contrib)) {
    shap_contrib <- predict(
      model,
      newdata = data,
      predcontrib = TRUE,
      approxcontrib = approxcontrib
    )
  }
  shap_contrib <- reshape_3d_shap_contrib(shap_contrib, target_class)
  if (is.null(colnames(shap_contrib))) {
    colnames(shap_contrib) <- paste0("X", seq_len(ncol(data)))
  }

  if (is.null(features)) {
@ -2,36 +2,7 @@
#'
#' Read a tree model text dump and plot the model.
#'
#' @param model Object of class `xgb.Booster`. If it contains feature names (they can be set through
#' \link{setinfo}), they will be used in the output from this function.
#' @param trees An integer vector of tree indices that should be used.
#' The default (`NULL`) uses all trees.
#' Useful, e.g., in multiclass classification to get only
#' the trees of one class. *Important*: the tree index in XGBoost models
#' is zero-based (e.g., use `trees = 0:2` for the first three trees).
#' @param plot_width,plot_height Width and height of the graph in pixels.
#' The values are passed to [DiagrammeR::render_graph()].
#' @param render Should the graph be rendered or not? The default is `TRUE`.
#' @param show_node_id a logical flag for whether to show node id's in the graph.
#' @param style Style to use for the plot. Options are:\itemize{
#' \item `"xgboost"`: will use the plot style defined in the core XGBoost library,
#' which is shared between different interfaces through the 'dot' format. This
#' style was not available before version 2.1.0 in R. It always plots the trees
#' vertically (from top to bottom).
#' \item `"R"`: will use the style defined from XGBoost's R interface, which predates
#' the introduction of the standardized style from the core library. It might plot
#' the trees horizontally (from left to right).
#' }
#'
#' Note that `style="xgboost"` is only supported when all of the following conditions are met:\itemize{
#' \item Only a single tree is being plotted.
#' \item Node IDs are not added to the graph.
#' \item The graph is being returned as `htmlwidget` (`render=TRUE`).
#' }
#' @param ... currently not used.
#'
#' @details
#'
#' When using `style="xgboost"`, the content of each node is visualized as follows:
#' - For non-terminal nodes, it will display the split condition (number or name if
#' available, and the condition that would decide to which node to go next).

@ -56,6 +27,31 @@
#'
#' This function uses [GraphViz](https://www.graphviz.org/) as DiagrammeR backend.
#'
#' @param model Object of class `xgb.Booster`. If it contains feature names (they can be set through
#' [setinfo()]), they will be used in the output from this function.
#' @param trees An integer vector of tree indices that should be used.
#' The default (`NULL`) uses all trees.
#' Useful, e.g., in multiclass classification to get only
#' the trees of one class. *Important*: the tree index in XGBoost models
#' is zero-based (e.g., use `trees = 0:2` for the first three trees).
#' @param plot_width,plot_height Width and height of the graph in pixels.
#' The values are passed to `DiagrammeR::render_graph()`.
#' @param render Should the graph be rendered or not? The default is `TRUE`.
#' @param show_node_id a logical flag for whether to show node id's in the graph.
#' @param style Style to use for the plot:
#' - `"xgboost"`: will use the plot style defined in the core XGBoost library,
#' which is shared between different interfaces through the 'dot' format. This
#' style was not available before version 2.1.0 in R. It always plots the trees
#' vertically (from top to bottom).
#' - `"R"`: will use the style defined from XGBoost's R interface, which predates
#' the introduction of the standardized style from the core library. It might plot
#' the trees horizontally (from left to right).
#'
#' Note that `style="xgboost"` is only supported when all of the following conditions are met:
#' - Only a single tree is being plotted.
#' - Node IDs are not added to the graph.
#' - The graph is being returned as `htmlwidget` (`render=TRUE`).
#' @param ... Currently not used.
#' @return
#' The value depends on the `render` parameter:
#' - If `render = TRUE` (default): Rendered graph object which is an htmlwidget of
@ -63,14 +59,13 @@
#' running from the command line.
#' - If `render = FALSE`: Graph object which is of DiagrammeR's class `dgr_graph`.
#' This could be useful if one wants to modify some of the graph attributes
#' before rendering the graph with [DiagrammeR::render_graph()].
#' before rendering the graph with `DiagrammeR::render_graph()`.
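For instance, rendering can be deferred (a sketch, assuming a fitted `bst`):

gr <- xgb.plot.tree(bst, trees = 0, render = FALSE)      # a DiagrammeR dgr_graph
DiagrammeR::render_graph(gr, width = 800, height = 600)  # render after tweaking attributes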
#'
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
#' bst <- xgboost(
#'   data = agaricus.train$data,
#'   label = agaricus.train$label,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
#'   max_depth = 3,
#'   eta = 1,
#'   nthread = 2,
@ -1,43 +1,39 @@
#' Save xgboost model to binary file
#' Save XGBoost model to binary file
#'
#' Save xgboost model to a file in binary or JSON format.
#' Save XGBoost model to a file in binary or JSON format.
#'
#' @param model Model object of \code{xgb.Booster} class.
#' @param fname Name of the file to write.
#'
#' Note that the extension of this file name determines the serialization format to use:\itemize{
#' \item Extension ".ubj" will use the universal binary JSON format (recommended).
#' This format uses binary types for e.g. floating point numbers, thereby preventing any loss
#' of precision when converting to a human-readable JSON text or similar.
#' \item Extension ".json" will use plain JSON, which is a human-readable format.
#' \item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will
#' not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
#' attribute that boosters might keep, nor feature names or user-specified attributes.
#' \item If the format is not specified by passing one of the file extensions above, will
#' default to UBJ.
#' }
#' @param model Model object of `xgb.Booster` class.
#' @param fname Name of the file to write. Its extension determines the serialization format:
#' - ".ubj": Use the universal binary JSON format (recommended).
#' This format uses binary types for e.g. floating point numbers, thereby preventing any loss
#' of precision when converting to a human-readable JSON text or similar.
#' - ".json": Use plain JSON, which is a human-readable format.
#' - ".deprecated": Use **deprecated** binary format. This format will
#' not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
#' attribute that boosters might keep, nor feature names or user-specified attributes.
#' - If the format is not specified by passing one of the file extensions above, will
#' default to UBJ.
#'
#' @details
#' This method allows saving a model in an xgboost-internal binary or text format which is universal
#' among the various xgboost interfaces. In R, the saved model file could be read in later
#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
#' of \code{\link{xgb.train}}.
#'
#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and
#' corresponding R-methods would need to be used to load it. Moreover, persisting the model with
#' \code{\link[base]{readRDS}} or \code{\link[base]{save}} might cause compatibility problems in
#' future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
#' how to persist models in a future-proof way, i.e. to make the model accessible in future
#' This method allows saving a model in an XGBoost-internal binary or text format which is universal
#' among the various XGBoost interfaces. In R, the saved model file could be read later
#' using either the [xgb.load()] function or the `xgb_model` parameter of [xgb.train()].
#'
#' Note: a model can also be saved as an R object (e.g., by using [readRDS()]
#' or [save()]). However, it would then only be compatible with R, and
#' corresponding R methods would need to be used to load it. Moreover, persisting the model with
#' [readRDS()] or [save()] might cause compatibility problems in
#' future versions of XGBoost. Consult [a-compatibility-note-for-saveRDS-save] to learn
#' how to persist models in a future-proof way, i.e., to make the model accessible in future
#' releases of XGBoost.
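Concretely, the file extension selects the format (a sketch, assuming a fitted `bst`):

xgb.save(bst, file.path(tempdir(), "model.ubj"))   # universal binary JSON (recommended)
xgb.save(bst, file.path(tempdir(), "model.json"))  # plain, human-readable JSON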
#'
#' @seealso
#' \code{\link{xgb.load}}
#' @seealso [xgb.load()]
#'
#' @examples
#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#' data(agaricus.test, package = "xgboost")
#'
#' ## Keep the number of threads to 1 for examples
#' nthread <- 1
@ -45,6 +41,7 @@
#'
#' train <- agaricus.train
#' test <- agaricus.test
#'
#' bst <- xgb.train(
#'   data = xgb.DMatrix(train$data, label = train$label),
#'   max_depth = 2,

@ -53,6 +50,7 @@
#'   nrounds = 2,
#'   objective = "binary:logistic"
#' )
#'
#' fname <- file.path(tempdir(), "xgb.ubj")
#' xgb.save(bst, fname)
#' bst <- xgb.load(fname)
@ -1,29 +1,34 @@
#' Save xgboost model to R's raw vector,
#' user can call xgb.load.raw to load the model back from raw vector
#' Save XGBoost model to R's raw vector
#'
#' Save xgboost model from xgboost or xgb.train
#' Save XGBoost model from [xgboost()] or [xgb.train()].
#' Call [xgb.load.raw()] to load the model back from a raw vector.
#'
#' @param model the model object.
#' @param raw_format The format for encoding the booster. Available options are
#' \itemize{
#' \item \code{json}: Encode the booster into JSON text document.
#' \item \code{ubj}: Encode the booster into Universal Binary JSON.
#' \item \code{deprecated}: Encode the booster into old customized binary format.
#' }
#' @param model The model object.
#' @param raw_format The format for encoding the booster:
#' - "json": Encode the booster into JSON text document.
#' - "ubj": Encode the booster into Universal Binary JSON.
#' - "deprecated": Encode the booster into old customized binary format.
#'
#' @examples
#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#' data(agaricus.test, package = "xgboost")
#'
#' ## Keep the number of threads to 2 for examples
#' nthread <- 2
#' ## Keep the number of threads to 1 for examples
#' nthread <- 1
#' data.table::setDTthreads(nthread)
#'
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
#'                  eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic")
#'
#' bst <- xgb.train(
#'   data = xgb.DMatrix(train$data, label = train$label),
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = nthread,
#'   nrounds = 2,
#'   objective = "binary:logistic"
#' )
#'
#' raw <- xgb.save.raw(bst)
#' bst <- xgb.load.raw(raw)
@ -1,183 +1,186 @@
#' eXtreme Gradient Boosting Training
#'
#' \code{xgb.train} is an advanced interface for training an xgboost model.
#' The \code{xgboost} function is a simpler wrapper for \code{xgb.train}.
#' `xgb.train()` is an advanced interface for training an xgboost model.
#' The [xgboost()] function is a simpler wrapper for `xgb.train()`.
#'
#' @param params the list of parameters. The complete list of parameters is
#' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
#' is a shorter summary:
#' available in the [online documentation](http://xgboost.readthedocs.io/en/latest/parameter.html).
#' Below is a shorter summary:
#'
#' 1. General Parameters
#' **1. General Parameters**
#'
#' \itemize{
#' \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}.
#' }
#' - `booster`: Which booster to use, can be `gbtree` or `gblinear`. Default: `gbtree`.
#'
#' 2. Booster Parameters
#' **2. Booster Parameters**
#'
#' 2.1. Parameters for Tree Booster
#' **2.1. Parameters for Tree Booster**
#' - `eta`: The learning rate: scale the contribution of each tree by a factor of `0 < eta < 1`
#' when it is added to the current approximation.
#' Used to prevent overfitting by making the boosting process more conservative.
#' Lower value for `eta` implies larger value for `nrounds`: low `eta` value means model
#' more robust to overfitting but slower to compute. Default: 0.3.
#' - `gamma`: Minimum loss reduction required to make a further partition on a leaf node of the tree.
#' The larger, the more conservative the algorithm will be.
#' - `max_depth`: Maximum depth of a tree. Default: 6.
#' - `min_child_weight`: Minimum sum of instance weight (hessian) needed in a child.
#' If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight,
#' then the building process will give up further partitioning.
#' In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node.
#' The larger, the more conservative the algorithm will be. Default: 1.
#' - `subsample`: Subsample ratio of the training instance.
#' Setting it to 0.5 means that xgboost randomly collects half of the data instances to grow trees,
#' and this will prevent overfitting. It makes computation shorter (because less data to analyse).
#' It is advised to use this parameter with `eta` and increase `nrounds`. Default: 1.
#' - `colsample_bytree`: Subsample ratio of columns when constructing each tree. Default: 1.
#' - `lambda`: L2 regularization term on weights. Default: 1.
#' - `alpha`: L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0.
#' - `num_parallel_tree`: Experimental parameter. Number of trees to grow per round.
#' Useful to test Random Forest through XGBoost.
#' (set `colsample_bytree < 1`, `subsample < 1` and `round = 1`) accordingly.
#' Default: 1.
#' - `monotone_constraints`: A numerical vector consisting of `1`, `0` and `-1` with its length
#' equal to the number of features in the training data.
#' `1` is increasing, `-1` is decreasing and `0` is no constraint.
#' - `interaction_constraints`: A list of vectors specifying feature indices of permitted interactions.
#' Each item of the list represents one permitted interaction where specified features are allowed to interact with each other.
#' Feature index values should start from `0` (`0` references the first column).
#' Leave argument unspecified for no interaction constraints.
#'
#' \itemize{
#' \item{ \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1}
#' when it is added to the current approximation.
#' Used to prevent overfitting by making the boosting process more conservative.
#' Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model
#' more robust to overfitting but slower to compute. Default: 0.3}
#' \item{ \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree.
#' the larger, the more conservative the algorithm will be.}
#' \item \code{max_depth} maximum depth of a tree. Default: 6
#' \item{\code{min_child_weight} minimum sum of instance weight (hessian) needed in a child.
#' If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight,
#' then the building process will give up further partitioning.
#' In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node.
#' The larger, the more conservative the algorithm will be. Default: 1}
#' \item{ \code{subsample} subsample ratio of the training instance.
#' Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees
#' and this will prevent overfitting. It makes computation shorter (because less data to analyse).
#' It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1}
#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
#' \item \code{lambda} L2 regularization term on weights. Default: 1
#' \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
#' \item{ \code{num_parallel_tree} Experimental parameter. number of trees to grow per round.
#' Useful to test Random Forest through XGBoost
#' (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly.
#' Default: 1}
#' \item{ \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length
#' equals to the number of features in the training data.
#' \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint.}
#' \item{ \code{interaction_constraints} A list of vectors specifying feature indices of permitted interactions.
#' Each item of the list represents one permitted interaction where specified features are allowed to interact with each other.
#' Feature index values should start from \code{0} (\code{0} references the first column).
#' Leave argument unspecified for no interaction constraints.}
#' }
#' **2.2. Parameters for Linear Booster**
#'
#' 2.2. Parameters for Linear Booster
#' - `lambda`: L2 regularization term on weights. Default: 0.
#' - `lambda_bias`: L2 regularization term on bias. Default: 0.
#' - `alpha`: L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0.
#'
#' \itemize{
#' \item \code{lambda} L2 regularization term on weights. Default: 0
#' \item \code{lambda_bias} L2 regularization term on bias. Default: 0
#' \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
#' }
#' **3. Task Parameters**
#'
#' 3. Task Parameters
#' - `objective`: Specifies the learning task and the corresponding learning objective.
#' Users can pass a self-defined function to it. The default objective options are below:
#' - `reg:squarederror`: Regression with squared loss (default).
#' - `reg:squaredlogerror`: Regression with squared log loss \eqn{1/2 \cdot (\log(pred + 1) - \log(label + 1))^2}.
#' All inputs are required to be greater than -1.
#' Also, see metric rmsle for possible issue with this objective.
#' - `reg:logistic`: Logistic regression.
#' - `reg:pseudohubererror`: Regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
#' - `binary:logistic`: Logistic regression for binary classification. Output probability.
#' - `binary:logitraw`: Logistic regression for binary classification, output score before logistic transformation.
#' - `binary:hinge`: Hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
#' - `count:poisson`: Poisson regression for count data, output mean of Poisson distribution.
#' The parameter `max_delta_step` is set to 0.7 by default in Poisson regression
#' (used to safeguard optimization).
#' - `survival:cox`: Cox regression for right censored survival time data (negative values are considered right censored).
#' Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional
#' hazard function \eqn{h(t) = h_0(t) \cdot HR}).
#' - `survival:aft`: Accelerated failure time model for censored survival time data. See
#' [Survival Analysis with Accelerated Failure Time](https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html)
#' for details.
#' The parameter `aft_loss_distribution` specifies the Probability Density Function
#' used by `survival:aft` and the `aft-nloglik` metric.
#' - `multi:softmax`: Set xgboost to do multiclass classification using the softmax objective.
#' Class is represented by a number and should be from 0 to `num_class - 1`.
#' - `multi:softprob`: Same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be
#' further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging
#' to each class.
#' - `rank:pairwise`: Set XGBoost to do ranking task by minimizing the pairwise loss.
#' - `rank:ndcg`: Use LambdaMART to perform list-wise ranking where
#' [Normalized Discounted Cumulative Gain (NDCG)](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) is maximized.
#' - `rank:map`: Use LambdaMART to perform list-wise ranking where
#' [Mean Average Precision (MAP)](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision)
#' is maximized.
#' - `reg:gamma`: Gamma regression with log-link. Output is a mean of gamma distribution.
#' It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be
#' [gamma-distributed](https://en.wikipedia.org/wiki/Gamma_distribution#Applications).
#' - `reg:tweedie`: Tweedie regression with log-link.
#' It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be
#' [Tweedie-distributed](https://en.wikipedia.org/wiki/Tweedie_distribution#Applications).
#'
#' \itemize{
#' \item{ \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it.
#' The default objective options are below:
#' \itemize{
#' \item \code{reg:squarederror} Regression with squared loss (Default).
#' \item{ \code{reg:squaredlogerror}: regression with squared log loss \eqn{1/2 * (log(pred + 1) - log(label + 1))^2}.
#' All inputs are required to be greater than -1.
#' Also, see metric rmsle for possible issue with this objective.}
#' \item \code{reg:logistic} logistic regression.
#' \item \code{reg:pseudohubererror}: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
#' \item \code{binary:logistic} logistic regression for binary classification. Output probability.
#' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
#' \item \code{binary:hinge}: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
#' \item{ \code{count:poisson}: Poisson regression for count data, output mean of Poisson distribution.
#' \code{max_delta_step} is set to 0.7 by default in poisson regression (used to safeguard optimization).}
#' \item{ \code{survival:cox}: Cox regression for right censored survival time data (negative values are considered right censored).
#' Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional
#' hazard function \code{h(t) = h0(t) * HR)}.}
#' \item{ \code{survival:aft}: Accelerated failure time model for censored survival time data. See
#' \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time}
#' for details.}
#' \item \code{aft_loss_distribution}: Probability Density Function used by \code{survival:aft} and \code{aft-nloglik} metric.
#' \item{ \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective.
#' Class is represented by a number and should be from 0 to \code{num_class - 1}.}
#' \item{ \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be
#' further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging
#' to each class.}
#' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#' \item{ \code{rank:ndcg}: Use LambdaMART to perform list-wise ranking where
#' \href{https://en.wikipedia.org/wiki/Discounted_cumulative_gain}{Normalized Discounted Cumulative Gain (NDCG)} is maximized.}
#' \item{ \code{rank:map}: Use LambdaMART to perform list-wise ranking where
#' \href{https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision}{Mean Average Precision (MAP)}
#' is maximized.}
#' \item{ \code{reg:gamma}: gamma regression with log-link.
#' Output is a mean of gamma distribution.
#' It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be
#' \href{https://en.wikipedia.org/wiki/Gamma_distribution#Applications}{gamma-distributed}.}
#' \item{ \code{reg:tweedie}: Tweedie regression with log-link.
#' It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be
#' \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.}
#' }
#' }
#' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
#' \item{ \code{eval_metric} evaluation metrics for validation data.
#' Users can pass a self-defined function to it.
#' Default: metric will be assigned according to objective
#' (rmse for regression, and error for classification, mean average precision for ranking).
#' List is provided in detail section.}
#' }
#' For custom objectives, one should pass a function taking as input the current predictions (as a numeric
#' vector or matrix) and the training data (as an `xgb.DMatrix` object) that will return a list with elements
#' `grad` and `hess`, which should be numeric vectors or matrices with the number of rows matching that
#' of the training data (same shape as the predictions that are passed as input to the function).
#' For multi-valued custom objectives, these should have shape `[nrows, ntargets]`. Note that negative values of
#' the Hessian will be clipped, so one might consider using the expected Hessian (Fisher information) if the
#' objective is non-convex.
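A custom squared-error objective following this contract might look as follows (a sketch; the gradient and Hessian of 1/2 * (pred - label)^2):

squared_error_obj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  grad <- preds - labels          # first derivative w.r.t. the prediction
  hess <- rep(1, length(preds))   # second derivative is constant
  list(grad = grad, hess = hess)
}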
#'
#' @param data training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
#' \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.
#' @param nrounds max number of boosting iterations.
#' See the tutorials [Custom Objective and Evaluation Metric](https://xgboost.readthedocs.io/en/stable/tutorials/custom_metric_obj.html)
#' and [Advanced Usage of Custom Objectives](https://xgboost.readthedocs.io/en/stable/tutorials/advanced_custom_obj)
#' for more information about custom objectives.
#'
#' - `base_score`: The initial prediction score of all instances, global bias. Default: 0.5.
#' - `eval_metric`: Evaluation metrics for validation data.
#' Users can pass a self-defined function to it.
#' Default: metric will be assigned according to objective
#' (rmse for regression, and error for classification, mean average precision for ranking).
#' List is provided in detail section.
#' @param data Training dataset. `xgb.train()` accepts only an `xgb.DMatrix` as the input.
#' [xgboost()], in addition, also accepts `matrix`, `dgCMatrix`, or name of a local data file.
#' @param nrounds Max number of boosting iterations.
#' @param evals Named list of `xgb.DMatrix` datasets to use for evaluating model performance.
#' Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
#' of these datasets during each boosting iteration, and stored in the end as a field named
#' \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or
#' \code{\link{xgb.cb.print.evaluation}} callback is engaged, the performance results are continuously
#' printed out during the training.
#' E.g., specifying \code{evals=list(validation1=mat1, validation2=mat2)} allows to track
#' the performance of each round's model on mat1 and mat2.
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain.
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain.
#' Metrics specified in either `eval_metric` or `feval` will be computed for each
#' of these datasets during each boosting iteration, and stored in the end as a field named
#' `evaluation_log` in the resulting object. When either `verbose>=1` or
#' [xgb.cb.print.evaluation()] callback is engaged, the performance results are continuously
#' printed out during the training.
#' E.g., specifying `evals=list(validation1=mat1, validation2=mat2)` allows tracking
#' the performance of each round's model on mat1 and mat2.
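For example (a sketch, assuming `dtrain` and `dvalid` are `xgb.DMatrix` objects):

bst <- xgb.train(
  params = list(objective = "binary:logistic", nthread = 1),
  data = dtrain,
  nrounds = 10,
  evals = list(train = dtrain, validation = dvalid)  # logged into evaluation_log
)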
#' @param obj Customized objective function. Should take two arguments: the first one will be the
#' current predictions (either a numeric vector or matrix depending on the number of targets / classes),
#' and the second one will be the `data` DMatrix object that is used for training.
#'
#' It should return a list with two elements `grad` and `hess` (in that order), as either
#' numeric vectors or numeric matrices depending on the number of targets / classes (same
#' dimension as the predictions that are passed as first argument).
#' @param feval Customized evaluation function. Just like `obj`, should take two arguments, with
#' the first one being the predictions and the second one the `data` DMatrix.
#'
#' Should return a list with two elements `metric` (name that will be displayed for this metric,
#' should be a string / character), and `value` (the number that the function calculates, should
#' be a numeric scalar).
#'
#' Note that even if passing `feval`, objectives also have an associated default metric that
#' will be evaluated in addition to it. In order to disable the built-in metric, one can pass
#' parameter `disable_default_eval_metric = TRUE`.
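A custom metric following this contract (a sketch computing classification error from predicted probabilities):

error_feval <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- mean(as.numeric(preds > 0.5) != labels)
  list(metric = "custom_error", value = err)
}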
#' @param verbose If 0, xgboost will stay silent. If 1, it will print information about performance.
#' If 2, some additional information will be printed out.
#' Note that setting \code{verbose > 0} automatically engages the
#' \code{xgb.cb.print.evaluation(period=1)} callback function.
#' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
#' Default is 1 which means all messages are printed. This parameter is passed to the
#' \code{\link{xgb.cb.print.evaluation}} callback.
#' @param early_stopping_rounds If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' doesn't improve for \code{k} rounds.
#' Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.
#' @param maximize If \code{feval} and \code{early_stopping_rounds} are set,
#' then this parameter must be set as well.
#' When it is \code{TRUE}, it means the larger the evaluation score the better.
#' This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.
#' @param save_period when it is non-NULL, model is saved to disk after every \code{save_period} rounds,
#' 0 means save at the end. The saving is handled by the \code{\link{xgb.cb.save.model}} callback.
#' If 2, some additional information will be printed out.
#' Note that setting `verbose > 0` automatically engages the
#' `xgb.cb.print.evaluation(period=1)` callback function.
#' @param print_every_n Print each nth iteration evaluation messages when `verbose>0`.
#' Default is 1 which means all messages are printed. This parameter is passed to the
#' [xgb.cb.print.evaluation()] callback.
#' @param early_stopping_rounds If `NULL`, the early stopping function is not triggered.
#' If set to an integer `k`, training with a validation set will stop if the performance
#' doesn't improve for `k` rounds. Setting this parameter engages the [xgb.cb.early.stop()] callback.
#' @param maximize If `feval` and `early_stopping_rounds` are set, then this parameter must be set as well.
#' When it is `TRUE`, it means the larger the evaluation score the better.
#' This parameter is passed to the [xgb.cb.early.stop()] callback.
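Early stopping in practice (a sketch, assuming `dtrain` and `dvalid` as above):

bst <- xgb.train(
  params = list(objective = "binary:logistic", eval_metric = "logloss", nthread = 1),
  data = dtrain,
  nrounds = 500,
  evals = list(validation = dvalid),
  early_stopping_rounds = 10  # stop if logloss has not improved for 10 rounds
)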
#' @param save_period When not `NULL`, model is saved to disk after every `save_period` rounds.
#' 0 means save at the end. The saving is handled by the [xgb.cb.save.model()] callback.
#' @param save_name the name or path for periodically saved model file.
#' @param xgb_model a previously built model to continue the training from.
#' Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a
#' file with a previously saved model.
#' @param callbacks a list of callback functions to perform various tasks during boosting.
#' See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the
#' parameters' values. User can provide either existing or their own callback methods in order
#' to customize the training process.
#' @param xgb_model A previously built model to continue the training from.
#' Could be either an object of class `xgb.Booster`, or its raw data, or the name of a
#' file with a previously saved model.
#' @param callbacks A list of callback functions to perform various tasks during boosting.
#' See [xgb.Callback()]. Some of the callbacks are automatically created depending on the
#' parameters' values. Users can provide either existing or their own callback methods in order
#' to customize the training process.
#'
#' Note that some callbacks might try to leave attributes in the resulting model object,
#' such as an evaluation log (a `data.table` object) - be aware that these objects are kept
#' as R attributes, and thus do not get saved when using XGBoost's own serializers like
#' \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
#' @param ... other parameters to pass to \code{params}.
#' @param label vector of response values. Should not be provided when data is
#' a local data file name or an \code{xgb.DMatrix}.
#' @param missing by default is set to NA, which means that NA values should be considered as 'missing'
#' by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
#' This parameter is only used when input is a dense matrix.
#' @param weight a vector indicating the weight for each row of the input.
#' Note that some callbacks might try to leave attributes in the resulting model object,
#' such as an evaluation log (a `data.table` object) - be aware that these objects are kept
#' as R attributes, and thus do not get saved when using XGBoost's own serializers like
#' [xgb.save()] (but are kept when using R serializers like [saveRDS()]).
#' @param ... Other parameters to pass to `params`.
#'
#' @return
#' An object of class \code{xgb.Booster}.
#' @return An object of class `xgb.Booster`.
#'
#' @details
#' These are the training functions for \code{xgboost}.
#' These are the training functions for [xgboost()].
#'
#' The \code{xgb.train} interface supports advanced features such as \code{evals},
#' The `xgb.train()` interface supports advanced features such as `evals`,
#' customized objective and evaluation metric functions, therefore it is more flexible
#' than the \code{xgboost} interface.
#' than the [xgboost()] interface.
#'
#' Parallelization is automatically enabled if \code{OpenMP} is present.
#' Number of threads can also be manually specified via the \code{nthread}
#' parameter.
#' Parallelization is automatically enabled if OpenMP is present.
#' Number of threads can also be manually specified via the `nthread` parameter.
#'
#' While in other interfaces the random seed defaults to zero, in R, if a parameter `seed`
#' is not manually supplied, it will generate a random seed through R's own random number generator,
@ -185,64 +188,56 @@
|
||||
#' RNG from R.
|
||||
#'
#' The evaluation metric is chosen automatically by XGBoost (according to the objective)
#' when the \code{eval_metric} parameter is not provided.
#' Users may set one or several \code{eval_metric} parameters.
#' when the `eval_metric` parameter is not provided.
#' Users may set one or several `eval_metric` parameters.
#' Note that when using a customized metric, only this single metric can be used.
#' The following is the list of built-in metrics for which XGBoost provides optimized implementations:
#' \itemize{
#' \item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error}
#' \item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood}
#' \item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}
#' \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
#' By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
#' A different threshold (e.g., 0.6) could be specified as "error@0.6".
#' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
#' \item \code{mae} Mean absolute error
#' \item \code{mape} Mean absolute percentage error
#' \item \code{auc} Area under the curve.
#' \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve} for ranking evaluation.
#' \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG}
#' }
#' - `rmse`: Root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error}
#' - `logloss`: Negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood}
#' - `mlogloss`: Multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}
#' - `error`: Binary classification error rate. It is calculated as `(# wrong cases) / (# all cases)`.
#'   By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
#'   A different threshold (e.g., 0.6) could be specified as `error@0.6`.
#' - `merror`: Multiclass classification error rate. It is calculated as `(# wrong cases) / (# all cases)`.
#' - `mae`: Mean absolute error.
#' - `mape`: Mean absolute percentage error.
#' - `auc`: Area under the curve.
#'   \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve} for ranking evaluation.
#' - `aucpr`: Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
#' - `ndcg`: Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG}
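#' For example, several built-in metrics can be tracked at once by repeating `eval_metric`
#' (a minimal sketch, not from the original docs; assumes `dtrain` and `evals` exist):
#' ```
#' params <- list(objective = "binary:logistic", eval_metric = "auc", eval_metric = "logloss")
#' bst <- xgb.train(params, dtrain, nrounds = 2, evals = evals)
#' ```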
#'
#' The following callbacks are automatically created when certain parameters are set:
#' \itemize{
#' \item \code{xgb.cb.print.evaluation} is turned on when \code{verbose > 0};
#' and the \code{print_every_n} parameter is passed to it.
#' \item \code{xgb.cb.evaluation.log} is on when \code{evals} is present.
#' \item \code{xgb.cb.early.stop}: when \code{early_stopping_rounds} is set.
#' \item \code{xgb.cb.save.model}: when \code{save_period > 0} is set.
#' }
#' - [xgb.cb.print.evaluation()] is turned on when `verbose > 0` and the `print_every_n`
#' parameter is passed to it.
#' - [xgb.cb.evaluation.log()] is on when `evals` is present.
#' - [xgb.cb.early.stop()]: When `early_stopping_rounds` is set.
#' - [xgb.cb.save.model()]: When `save_period > 0` is set.
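#' As an illustration of the first two (a minimal sketch, not from the original docs;
#' assumes `params`, `dtrain` and `evals` exist, and that the logging callback stores
#' its result under an `evaluation_log` attribute):
#' ```
#' bst <- xgb.train(params, dtrain, nrounds = 50, evals = evals, print_every_n = 10)
#' attributes(bst)$evaluation_log
#' ```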
#'
#' Note that objects of type `xgb.Booster` as returned by this function behave a bit differently
#' from typical R objects (it's an 'altrep' list class), and it makes a separation between
#' internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr}
#' and shared between interfaces through serialization functions like \link{xgb.save}; and
#' R-specific attributes (typically the result from a callback), accessed through \link{attributes}
#' and \link{attr}, which are otherwise
#' only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and
#' not used in any way by functions like \link{predict.xgb.Booster}.
#' internal booster attributes (restricted to jsonifyable data), accessed through [xgb.attr()]
#' and shared between interfaces through serialization functions like [xgb.save()]; and
#' R-specific attributes (typically the result from a callback), accessed through [attributes()]
#' and [attr()], which are otherwise
#' only used in the R interface, only kept when using R's serializers like [saveRDS()], and
#' not used in any way by functions like `predict.xgb.Booster()`.
#'
#' Be aware that one such R attribute that is automatically added is `params` - this attribute
#' is assigned from the `params` argument to this function, and is only meant to serve as a
#' reference for what went into the booster, but is not used in other methods that take a booster
#' object - so for example, changing the booster's configuration requires calling `xgb.config<-`
#' or 'xgb.parameters<-', while simply modifying `attributes(model)$params$<...>` will have no
#' or `xgb.parameters<-`, while simply modifying `attributes(model)$params$<...>` will have no
#' effect elsewhere.
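#' For example (a minimal sketch, not from the original docs; assumes `bst` is a trained booster):
#' ```
#' xgb.attr(bst, "note") <- "kept by xgb.save"    # internal booster attribute
#' attr(bst, "r_note") <- "kept only by saveRDS"  # R-specific attribute
#' attr(bst, "params")$eta <- 0.5                 # changes only the R-side reference
#' xgb.parameters(bst) <- list(eta = 0.5)         # changes the actual configuration
#' ```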
#'
#' @seealso
#' \code{\link{xgb.Callback}},
#' \code{\link{predict.xgb.Booster}},
#' \code{\link{xgb.cv}}
#' @seealso [xgb.Callback()], [predict.xgb.Booster()], [xgb.cv()]
#'
#' @references
#'
#' Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System",
#' 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#' data(agaricus.test, package = "xgboost")
#'
#' ## Keep the number of threads to 1 for examples
#' nthread <- 1
@@ -257,8 +252,13 @@
#' evals <- list(train = dtrain, eval = dtest)
#'
#' ## A simple xgb.train example:
#' param <- list(max_depth = 2, eta = 1, nthread = nthread,
#'               objective = "binary:logistic", eval_metric = "auc")
#' param <- list(
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = nthread,
#'   objective = "binary:logistic",
#'   eval_metric = "auc"
#' )
#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0)
#'
#' ## An xgb.train example where custom objective and evaluation metric are
@@ -278,38 +278,67 @@
#'
#' # These functions could be used by passing them either:
#' # as 'objective' and 'eval_metric' parameters in the params list:
#' param <- list(max_depth = 2, eta = 1, nthread = nthread,
#'               objective = logregobj, eval_metric = evalerror)
#' param <- list(
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = nthread,
#'   objective = logregobj,
#'   eval_metric = evalerror
#' )
#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0)
#'
#' # or through the ... arguments:
#' param <- list(max_depth = 2, eta = 1, nthread = nthread)
#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
#'                  objective = logregobj, eval_metric = evalerror)
#' bst <- xgb.train(
#'   param,
#'   dtrain,
#'   nrounds = 2,
#'   evals = evals,
#'   verbose = 0,
#'   objective = logregobj,
#'   eval_metric = evalerror
#' )
#'
#' # or as dedicated 'obj' and 'feval' parameters of xgb.train:
#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals,
#'                  obj = logregobj, feval = evalerror)
#' bst <- xgb.train(
#'   param, dtrain, nrounds = 2, evals = evals, obj = logregobj, feval = evalerror
#' )
#'
#'
#' ## An xgb.train example of using variable learning rates at each iteration:
#' param <- list(max_depth = 2, eta = 1, nthread = nthread,
#'               objective = "binary:logistic", eval_metric = "auc")
#' param <- list(
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = nthread,
#'   objective = "binary:logistic",
#'   eval_metric = "auc"
#' )
#' my_etas <- list(eta = c(0.5, 0.1))
#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
#'                  callbacks = list(xgb.cb.reset.parameters(my_etas)))
#'
#' bst <- xgb.train(
#'   param,
#'   dtrain,
#'   nrounds = 2,
#'   evals = evals,
#'   verbose = 0,
#'   callbacks = list(xgb.cb.reset.parameters(my_etas))
#' )
#'
#' ## Early stopping:
#' bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals,
#'                  early_stopping_rounds = 3)
#' bst <- xgb.train(
#'   param, dtrain, nrounds = 25, evals = evals, early_stopping_rounds = 3
#' )
#'
#' ## An 'xgboost' interface example:
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
#'                max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
#'                objective = "binary:logistic")
#' bst <- xgboost(
#'   x = agaricus.train$data,
#'   y = factor(agaricus.train$label),
#'   params = list(max_depth = 2, eta = 1),
#'   nthread = nthread,
#'   nrounds = 2
#' )
#' pred <- predict(bst, agaricus.test$data)
#'
#' @rdname xgb.train
#' @export
xgb.train <- function(params = list(), data, nrounds, evals = list(),
                      obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,
File diff suppressed because it is too large
R-package/config.h.in (new file, 66 lines)
@@ -0,0 +1,66 @@
/* config.h.in.  Generated from configure.ac by autoheader.  */

/* Define if building universal (internal helper macro) */
#undef AC_APPLE_UNIVERSAL_BUILD

/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H

/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H

/* Define to 1 if you have the <stdio.h> header file. */
#undef HAVE_STDIO_H

/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H

/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H

/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H

/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H

/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H

/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H

/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT

/* Define to the full name of this package. */
#undef PACKAGE_NAME

/* Define to the full name and version of this package. */
#undef PACKAGE_STRING

/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME

/* Define to the home page for this package. */
#undef PACKAGE_URL

/* Define to the version of this package. */
#undef PACKAGE_VERSION

/* Define to 1 if all of the C90 standard headers exist (not just the ones
   required in a freestanding environment).  This macro is provided for
   backward compatibility; new code need not use it. */
#undef STDC_HEADERS

/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
   significant byte first (like Motorola and SPARC, unlike Intel). */
#if defined AC_APPLE_UNIVERSAL_BUILD
# if defined __BIG_ENDIAN__
#  define WORDS_BIGENDIAN 1
# endif
#else
# ifndef WORDS_BIGENDIAN
#  undef WORDS_BIGENDIAN
# endif
#endif
R-package/configure (vendored, 578 lines changed)
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.71 for xgboost 2.1.0.
|
||||
# Generated by GNU Autoconf 2.71 for xgboost 2.2.0.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
|
||||
@@ -607,17 +607,50 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='xgboost'
|
||||
PACKAGE_TARNAME='xgboost'
|
||||
PACKAGE_VERSION='2.1.0'
|
||||
PACKAGE_STRING='xgboost 2.1.0'
|
||||
PACKAGE_VERSION='2.2.0'
|
||||
PACKAGE_STRING='xgboost 2.2.0'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
# Factoring default headers for most tests.
|
||||
ac_includes_default="\
|
||||
#include <stddef.h>
|
||||
#ifdef HAVE_STDIO_H
|
||||
# include <stdio.h>
|
||||
#endif
|
||||
#ifdef HAVE_STDLIB_H
|
||||
# include <stdlib.h>
|
||||
#endif
|
||||
#ifdef HAVE_STRING_H
|
||||
# include <string.h>
|
||||
#endif
|
||||
#ifdef HAVE_INTTYPES_H
|
||||
# include <inttypes.h>
|
||||
#endif
|
||||
#ifdef HAVE_STDINT_H
|
||||
# include <stdint.h>
|
||||
#endif
|
||||
#ifdef HAVE_STRINGS_H
|
||||
# include <strings.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
# include <sys/types.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
# include <sys/stat.h>
|
||||
#endif
|
||||
#ifdef HAVE_UNISTD_H
|
||||
# include <unistd.h>
|
||||
#endif"
|
||||
|
||||
ac_header_cxx_list=
|
||||
ac_subst_vars='LTLIBOBJS
|
||||
LIBOBJS
|
||||
BACKTRACE_LIB
|
||||
ENDIAN_FLAG
|
||||
OPENMP_LIB
|
||||
OPENMP_CXXFLAGS
|
||||
USE_LITTLE_ENDIAN
|
||||
OBJEXT
|
||||
EXEEXT
|
||||
ac_ct_CXX
|
||||
@@ -676,7 +709,8 @@ CXXFLAGS
|
||||
LDFLAGS
|
||||
LIBS
|
||||
CPPFLAGS
|
||||
CCC'
|
||||
CCC
|
||||
USE_LITTLE_ENDIAN'
|
||||
|
||||
|
||||
# Initialize some variables set by options.
|
||||
@@ -1225,7 +1259,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures xgboost 2.1.0 to adapt to many kinds of systems.
|
||||
\`configure' configures xgboost 2.2.0 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1287,7 +1321,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of xgboost 2.1.0:";;
|
||||
short | recursive ) echo "Configuration of xgboost 2.2.0:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1299,6 +1333,9 @@ Some influential environment variables:
|
||||
LIBS libraries to pass to the linker, e.g. -l<library>
|
||||
CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
|
||||
you have headers in a nonstandard directory <include dir>
|
||||
USE_LITTLE_ENDIAN
|
||||
"Whether to build with little endian (checks at compile time if
|
||||
unset)"
|
||||
|
||||
Use these variables to override the choices made by `configure' or to help
|
||||
it to find libraries and programs with nonstandard names/locations.
|
||||
@@ -1367,7 +1404,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
xgboost configure 2.1.0
|
||||
xgboost configure 2.2.0
|
||||
generated by GNU Autoconf 2.71
|
||||
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
@@ -1509,6 +1546,39 @@ fi
|
||||
as_fn_set_status $ac_retval
|
||||
|
||||
} # ac_fn_cxx_try_run
|
||||
|
||||
# ac_fn_cxx_check_header_compile LINENO HEADER VAR INCLUDES
|
||||
# ---------------------------------------------------------
|
||||
# Tests whether HEADER exists and can be compiled using the include files in
|
||||
# INCLUDES, setting the cache variable VAR accordingly.
|
||||
ac_fn_cxx_check_header_compile ()
|
||||
{
|
||||
as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
|
||||
printf %s "checking for $2... " >&6; }
|
||||
if eval test \${$3+y}
|
||||
then :
|
||||
printf %s "(cached) " >&6
|
||||
else $as_nop
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
$4
|
||||
#include <$2>
|
||||
_ACEOF
|
||||
if ac_fn_cxx_try_compile "$LINENO"
|
||||
then :
|
||||
eval "$3=yes"
|
||||
else $as_nop
|
||||
eval "$3=no"
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
|
||||
fi
|
||||
eval ac_res=\$$3
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
|
||||
printf "%s\n" "$ac_res" >&6; }
|
||||
eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
|
||||
|
||||
} # ac_fn_cxx_check_header_compile
|
||||
ac_configure_args_raw=
|
||||
for ac_arg
|
||||
do
|
||||
@@ -1533,7 +1603,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by xgboost $as_me 2.1.0, which was
|
||||
It was created by xgboost $as_me 2.2.0, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
$ $0$ac_configure_args_raw
|
||||
@@ -2020,6 +2090,15 @@ main (int argc, char **argv)
|
||||
}
|
||||
"
|
||||
|
||||
as_fn_append ac_header_cxx_list " stdio.h stdio_h HAVE_STDIO_H"
|
||||
as_fn_append ac_header_cxx_list " stdlib.h stdlib_h HAVE_STDLIB_H"
|
||||
as_fn_append ac_header_cxx_list " string.h string_h HAVE_STRING_H"
|
||||
as_fn_append ac_header_cxx_list " inttypes.h inttypes_h HAVE_INTTYPES_H"
|
||||
as_fn_append ac_header_cxx_list " stdint.h stdint_h HAVE_STDINT_H"
|
||||
as_fn_append ac_header_cxx_list " strings.h strings_h HAVE_STRINGS_H"
|
||||
as_fn_append ac_header_cxx_list " sys/stat.h sys_stat_h HAVE_SYS_STAT_H"
|
||||
as_fn_append ac_header_cxx_list " sys/types.h sys_types_h HAVE_SYS_TYPES_H"
|
||||
as_fn_append ac_header_cxx_list " unistd.h unistd_h HAVE_UNISTD_H"
|
||||
# Check that the precious variables saved in the cache have kept the same
|
||||
# value.
|
||||
ac_cache_corrupted=false
|
||||
@@ -2792,38 +2871,289 @@ fi
|
||||
|
||||
|
||||
### Endian detection
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking endian" >&5
|
||||
printf %s "checking endian... " >&6; }
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: " >&5
|
||||
printf "%s\n" "" >&6; }
|
||||
if test "$cross_compiling" = yes
|
||||
|
||||
ac_header= ac_cache=
|
||||
for ac_item in $ac_header_cxx_list
|
||||
do
|
||||
if test $ac_cache; then
|
||||
ac_fn_cxx_check_header_compile "$LINENO" $ac_header ac_cv_header_$ac_cache "$ac_includes_default"
|
||||
if eval test \"x\$ac_cv_header_$ac_cache\" = xyes; then
|
||||
printf "%s\n" "#define $ac_item 1" >> confdefs.h
|
||||
fi
|
||||
ac_header= ac_cache=
|
||||
elif test $ac_header; then
|
||||
ac_cache=$ac_item
|
||||
else
|
||||
ac_header=$ac_item
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if test $ac_cv_header_stdlib_h = yes && test $ac_cv_header_string_h = yes
|
||||
then :
|
||||
{ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
|
||||
printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;}
|
||||
as_fn_error $? "cannot run test program while cross compiling
|
||||
See \`config.log' for more details" "$LINENO" 5; }
|
||||
|
||||
printf "%s\n" "#define STDC_HEADERS 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
if test -z "${USE_LITTLE_ENDIAN+x}"
|
||||
then :
|
||||
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: Checking system endianness as USE_LITTLE_ENDIAN is unset" >&5
|
||||
printf "%s\n" "$as_me: Checking system endianness as USE_LITTLE_ENDIAN is unset" >&6;}
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking system endianness" >&5
|
||||
printf %s "checking system endianness... " >&6; }
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
|
||||
printf %s "checking whether byte ordering is bigendian... " >&6; }
|
||||
if test ${ac_cv_c_bigendian+y}
|
||||
then :
|
||||
printf %s "(cached) " >&6
|
||||
else $as_nop
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
ac_cv_c_bigendian=unknown
|
||||
# See if we're dealing with a universal compiler.
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
#include <stdint.h>
|
||||
#ifndef __APPLE_CC__
|
||||
not a universal capable compiler
|
||||
#endif
|
||||
typedef int dummy;
|
||||
|
||||
_ACEOF
|
||||
if ac_fn_cxx_try_compile "$LINENO"
|
||||
then :
|
||||
|
||||
# Check for potential -arch flags. It is not universal unless
|
||||
# there are at least two -arch flags with different values.
|
||||
ac_arch=
|
||||
ac_prev=
|
||||
for ac_word in $CC $CFLAGS $CPPFLAGS $LDFLAGS; do
|
||||
if test -n "$ac_prev"; then
|
||||
case $ac_word in
|
||||
i?86 | x86_64 | ppc | ppc64)
|
||||
if test -z "$ac_arch" || test "$ac_arch" = "$ac_word"; then
|
||||
ac_arch=$ac_word
|
||||
else
|
||||
ac_cv_c_bigendian=universal
|
||||
break
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
ac_prev=
|
||||
elif test "x$ac_word" = "x-arch"; then
|
||||
ac_prev=arch
|
||||
fi
|
||||
done
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
|
||||
if test $ac_cv_c_bigendian = unknown; then
|
||||
# See if sys/param.h defines the BYTE_ORDER macro.
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
#include <sys/types.h>
|
||||
#include <sys/param.h>
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
const uint16_t endianness = 256; return !!(*(const uint8_t *)&endianness);
|
||||
#if ! (defined BYTE_ORDER && defined BIG_ENDIAN \
|
||||
&& defined LITTLE_ENDIAN && BYTE_ORDER && BIG_ENDIAN \
|
||||
&& LITTLE_ENDIAN)
|
||||
bogus endian macros
|
||||
#endif
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_cxx_try_compile "$LINENO"
|
||||
then :
|
||||
# It does; now see whether it defined to BIG_ENDIAN or not.
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
#include <sys/types.h>
|
||||
#include <sys/param.h>
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
#if BYTE_ORDER != BIG_ENDIAN
|
||||
not big endian
|
||||
#endif
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_cxx_try_compile "$LINENO"
|
||||
then :
|
||||
ac_cv_c_bigendian=yes
|
||||
else $as_nop
|
||||
ac_cv_c_bigendian=no
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
|
||||
fi
|
||||
if test $ac_cv_c_bigendian = unknown; then
|
||||
# See if <limits.h> defines _LITTLE_ENDIAN or _BIG_ENDIAN (e.g., Solaris).
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
#include <limits.h>
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
#if ! (defined _LITTLE_ENDIAN || defined _BIG_ENDIAN)
|
||||
bogus endian macros
|
||||
#endif
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_cxx_try_compile "$LINENO"
|
||||
then :
|
||||
# It does; now see whether it defined to _BIG_ENDIAN or not.
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
#include <limits.h>
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
#ifndef _BIG_ENDIAN
|
||||
not big endian
|
||||
#endif
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_cxx_try_compile "$LINENO"
|
||||
then :
|
||||
ac_cv_c_bigendian=yes
|
||||
else $as_nop
|
||||
ac_cv_c_bigendian=no
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
|
||||
fi
|
||||
if test $ac_cv_c_bigendian = unknown; then
|
||||
# Compile a test program.
|
||||
if test "$cross_compiling" = yes
|
||||
then :
|
||||
# Try to guess by grepping values from an object file.
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
unsigned short int ascii_mm[] =
|
||||
{ 0x4249, 0x4765, 0x6E44, 0x6961, 0x6E53, 0x7953, 0 };
|
||||
unsigned short int ascii_ii[] =
|
||||
{ 0x694C, 0x5454, 0x656C, 0x6E45, 0x6944, 0x6E61, 0 };
|
||||
int use_ascii (int i) {
|
||||
return ascii_mm[i] + ascii_ii[i];
|
||||
}
|
||||
unsigned short int ebcdic_ii[] =
|
||||
{ 0x89D3, 0xE3E3, 0x8593, 0x95C5, 0x89C4, 0x9581, 0 };
|
||||
unsigned short int ebcdic_mm[] =
|
||||
{ 0xC2C9, 0xC785, 0x95C4, 0x8981, 0x95E2, 0xA8E2, 0 };
|
||||
int use_ebcdic (int i) {
|
||||
return ebcdic_mm[i] + ebcdic_ii[i];
|
||||
}
|
||||
extern int foo;
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
return use_ascii (foo) == use_ebcdic (foo);
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_cxx_try_compile "$LINENO"
|
||||
then :
|
||||
if grep BIGenDianSyS conftest.$ac_objext >/dev/null; then
|
||||
ac_cv_c_bigendian=yes
|
||||
fi
|
||||
if grep LiTTleEnDian conftest.$ac_objext >/dev/null ; then
|
||||
if test "$ac_cv_c_bigendian" = unknown; then
|
||||
ac_cv_c_bigendian=no
|
||||
else
|
||||
# finding both strings is unlikely to happen, but who knows?
|
||||
ac_cv_c_bigendian=unknown
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
|
||||
else $as_nop
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
$ac_includes_default
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
|
||||
/* Are we little or big endian? From Harbison&Steele. */
|
||||
union
|
||||
{
|
||||
long int l;
|
||||
char c[sizeof (long int)];
|
||||
} u;
|
||||
u.l = 1;
|
||||
return u.c[sizeof (long int) - 1] == 1;
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_cxx_try_run "$LINENO"
|
||||
then :
|
||||
ENDIAN_FLAG="-DDMLC_CMAKE_LITTLE_ENDIAN=1"
|
||||
ac_cv_c_bigendian=no
|
||||
else $as_nop
|
||||
ENDIAN_FLAG="-DDMLC_CMAKE_LITTLE_ENDIAN=0"
|
||||
ac_cv_c_bigendian=yes
|
||||
fi
|
||||
rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
|
||||
conftest.$ac_objext conftest.beam conftest.$ac_ext
|
||||
fi
|
||||
|
||||
fi
|
||||
fi
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_bigendian" >&5
|
||||
printf "%s\n" "$ac_cv_c_bigendian" >&6; }
|
||||
case $ac_cv_c_bigendian in #(
|
||||
yes)
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: using big endian" >&5
|
||||
printf "%s\n" "using big endian" >&6; }
|
||||
ENDIAN_FLAG="-DDMLC_CMAKE_LITTLE_ENDIAN=0";; #(
|
||||
no)
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: using little endian" >&5
|
||||
printf "%s\n" "using little endian" >&6; }
|
||||
ENDIAN_FLAG="-DDMLC_CMAKE_LITTLE_ENDIAN=1" ;; #(
|
||||
universal)
|
||||
|
||||
printf "%s\n" "#define AC_APPLE_UNIVERSAL_BUILD 1" >>confdefs.h
|
||||
|
||||
;; #(
|
||||
*)
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unknown" >&5
|
||||
printf "%s\n" "unknown" >&6; }
|
||||
as_fn_error $? "Could not determine endianness. Please set USE_LITTLE_ENDIAN" "$LINENO" 5
|
||||
;;
|
||||
esac
|
||||
|
||||
|
||||
else $as_nop
|
||||
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: Forcing endianness to: ${USE_LITTLE_ENDIAN}" >&5
|
||||
printf "%s\n" "$as_me: Forcing endianness to: ${USE_LITTLE_ENDIAN}" >&6;}
|
||||
ENDIAN_FLAG="-DDMLC_CMAKE_LITTLE_ENDIAN=${USE_LITTLE_ENDIAN}"
|
||||
|
||||
fi
|
||||
|
||||
OPENMP_CXXFLAGS=""
|
||||
|
||||
@@ -2877,6 +3207,8 @@ fi
|
||||
|
||||
ac_config_files="$ac_config_files src/Makevars"
|
||||
|
||||
ac_config_headers="$ac_config_headers config.h"
|
||||
|
||||
cat >confcache <<\_ACEOF
|
||||
# This file is a shell script that caches the results of configure
|
||||
# tests run on this system so they can be shared between configure
|
||||
@@ -2967,43 +3299,7 @@ test "x$prefix" = xNONE && prefix=$ac_default_prefix
|
||||
# Let make expand exec_prefix.
|
||||
test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
|
||||
|
||||
# Transform confdefs.h into DEFS.
|
||||
# Protect against shell expansion while executing Makefile rules.
|
||||
# Protect against Makefile macro expansion.
|
||||
#
|
||||
# If the first sed substitution is executed (which looks for macros that
|
||||
# take arguments), then branch to the quote section. Otherwise,
|
||||
# look for a macro that doesn't take arguments.
|
||||
ac_script='
|
||||
:mline
|
||||
/\\$/{
|
||||
N
|
||||
s,\\\n,,
|
||||
b mline
|
||||
}
|
||||
t clear
|
||||
:clear
|
||||
s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g
|
||||
t quote
|
||||
s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g
|
||||
t quote
|
||||
b any
|
||||
:quote
|
||||
s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g
|
||||
s/\[/\\&/g
|
||||
s/\]/\\&/g
|
||||
s/\$/$$/g
|
||||
H
|
||||
:any
|
||||
${
|
||||
g
|
||||
s/^\n//
|
||||
s/\n/ /g
|
||||
p
|
||||
}
|
||||
'
|
||||
DEFS=`sed -n "$ac_script" confdefs.h`
|
||||
|
||||
DEFS=-DHAVE_CONFIG_H
|
||||
|
||||
ac_libobjs=
|
||||
ac_ltlibobjs=
|
||||
@@ -3023,6 +3319,7 @@ LTLIBOBJS=$ac_ltlibobjs
|
||||
|
||||
|
||||
|
||||
|
||||
: "${CONFIG_STATUS=./config.status}"
|
||||
ac_write_fail=0
|
||||
ac_clean_files_save=$ac_clean_files
|
||||
@@ -3412,7 +3709,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by xgboost $as_me 2.1.0, which was
|
||||
This file was extended by xgboost $as_me 2.2.0, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -3430,11 +3727,15 @@ case $ac_config_files in *"
|
||||
"*) set x $ac_config_files; shift; ac_config_files=$*;;
|
||||
esac
|
||||
|
||||
case $ac_config_headers in *"
|
||||
"*) set x $ac_config_headers; shift; ac_config_headers=$*;;
|
||||
esac
|
||||
|
||||
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
# Files that config.status was made for.
|
||||
config_files="$ac_config_files"
|
||||
config_headers="$ac_config_headers"
|
||||
|
||||
_ACEOF
|
||||
|
||||
@@ -3455,10 +3756,15 @@ Usage: $0 [OPTION]... [TAG]...
|
||||
--recheck update $as_me by reconfiguring in the same conditions
|
||||
--file=FILE[:TEMPLATE]
|
||||
instantiate the configuration file FILE
|
||||
--header=FILE[:TEMPLATE]
|
||||
instantiate the configuration header FILE
|
||||
|
||||
Configuration files:
|
||||
$config_files
|
||||
|
||||
Configuration headers:
|
||||
$config_headers
|
||||
|
||||
Report bugs to the package provider."
|
||||
|
||||
_ACEOF
|
||||
@@ -3467,7 +3773,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config='$ac_cs_config_escaped'
|
||||
ac_cs_version="\\
|
||||
xgboost config.status 2.1.0
|
||||
xgboost config.status 2.2.0
|
||||
configured by $0, generated by GNU Autoconf 2.71,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
@@ -3521,7 +3827,18 @@ do
|
||||
esac
|
||||
as_fn_append CONFIG_FILES " '$ac_optarg'"
|
||||
ac_need_defaults=false;;
|
||||
--he | --h | --help | --hel | -h )
|
||||
--header | --heade | --head | --hea )
|
||||
$ac_shift
|
||||
case $ac_optarg in
|
||||
*\'*) ac_optarg=`printf "%s\n" "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
|
||||
esac
|
||||
as_fn_append CONFIG_HEADERS " '$ac_optarg'"
|
||||
ac_need_defaults=false;;
|
||||
--he | --h)
|
||||
# Conflict between --help and --header
|
||||
as_fn_error $? "ambiguous option: \`$1'
|
||||
Try \`$0 --help' for more information.";;
|
||||
--help | --hel | -h )
|
||||
printf "%s\n" "$ac_cs_usage"; exit ;;
|
||||
-q | -quiet | --quiet | --quie | --qui | --qu | --q \
|
||||
| -silent | --silent | --silen | --sile | --sil | --si | --s)
|
||||
@@ -3578,6 +3895,7 @@ for ac_config_target in $ac_config_targets
|
||||
do
|
||||
case $ac_config_target in
|
||||
"src/Makevars") CONFIG_FILES="$CONFIG_FILES src/Makevars" ;;
|
||||
"config.h") CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;;
|
||||
|
||||
*) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
|
||||
esac
|
||||
@@ -3590,6 +3908,7 @@ done
|
||||
# bizarre bug on SunOS 4.1.3.
|
||||
if $ac_need_defaults; then
|
||||
test ${CONFIG_FILES+y} || CONFIG_FILES=$config_files
|
||||
test ${CONFIG_HEADERS+y} || CONFIG_HEADERS=$config_headers
|
||||
fi
|
||||
|
||||
# Have a temporary directory for convenience. Make it in the build tree
|
||||
@@ -3777,8 +4096,116 @@ fi
|
||||
cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
fi # test -n "$CONFIG_FILES"
|
||||
|
||||
# Set up the scripts for CONFIG_HEADERS section.
|
||||
# No need to generate them if there are no CONFIG_HEADERS.
|
||||
# This happens for instance with `./config.status Makefile'.
|
||||
if test -n "$CONFIG_HEADERS"; then
|
||||
cat >"$ac_tmp/defines.awk" <<\_ACAWK ||
|
||||
BEGIN {
|
||||
_ACEOF
|
||||
|
||||
eval set X " :F $CONFIG_FILES "
|
||||
# Transform confdefs.h into an awk script `defines.awk', embedded as
|
||||
# here-document in config.status, that substitutes the proper values into
|
||||
# config.h.in to produce config.h.
|
||||
|
||||
# Create a delimiter string that does not exist in confdefs.h, to ease
|
||||
# handling of long lines.
|
||||
ac_delim='%!_!# '
|
||||
for ac_last_try in false false :; do
|
||||
ac_tt=`sed -n "/$ac_delim/p" confdefs.h`
|
||||
if test -z "$ac_tt"; then
|
||||
break
|
||||
elif $ac_last_try; then
|
||||
as_fn_error $? "could not make $CONFIG_HEADERS" "$LINENO" 5
|
||||
else
|
||||
ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
|
||||
fi
|
||||
done
|
||||
|
||||
# For the awk script, D is an array of macro values keyed by name,
|
||||
# likewise P contains macro parameters if any. Preserve backslash
|
||||
# newline sequences.
|
||||
|
||||
ac_word_re=[_$as_cr_Letters][_$as_cr_alnum]*
|
||||
sed -n '
|
||||
s/.\{148\}/&'"$ac_delim"'/g
|
||||
t rset
|
||||
:rset
|
||||
s/^[ ]*#[ ]*define[ ][ ]*/ /
|
||||
t def
|
||||
d
|
||||
:def
|
||||
s/\\$//
|
||||
t bsnl
|
||||
s/["\\]/\\&/g
|
||||
s/^ \('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/P["\1"]="\2"\
|
||||
D["\1"]=" \3"/p
|
||||
s/^ \('"$ac_word_re"'\)[ ]*\(.*\)/D["\1"]=" \2"/p
|
||||
d
|
||||
:bsnl
|
||||
s/["\\]/\\&/g
|
||||
s/^ \('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/P["\1"]="\2"\
|
||||
D["\1"]=" \3\\\\\\n"\\/p
|
||||
t cont
|
||||
s/^ \('"$ac_word_re"'\)[ ]*\(.*\)/D["\1"]=" \2\\\\\\n"\\/p
|
||||
t cont
|
||||
d
|
||||
:cont
|
||||
n
|
||||
s/.\{148\}/&'"$ac_delim"'/g
|
||||
t clear
|
||||
:clear
|
||||
s/\\$//
|
||||
t bsnlc
|
||||
s/["\\]/\\&/g; s/^/"/; s/$/"/p
|
||||
d
|
||||
:bsnlc
|
||||
s/["\\]/\\&/g; s/^/"/; s/$/\\\\\\n"\\/p
|
||||
b cont
|
||||
' <confdefs.h | sed '
|
||||
s/'"$ac_delim"'/"\\\
|
||||
"/g' >>$CONFIG_STATUS || ac_write_fail=1
|
||||
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
for (key in D) D_is_set[key] = 1
|
||||
FS = ""
|
||||
}
|
||||
/^[\t ]*#[\t ]*(define|undef)[\t ]+$ac_word_re([\t (]|\$)/ {
|
||||
line = \$ 0
|
||||
split(line, arg, " ")
|
||||
if (arg[1] == "#") {
|
||||
defundef = arg[2]
|
||||
mac1 = arg[3]
|
||||
} else {
|
||||
defundef = substr(arg[1], 2)
|
||||
mac1 = arg[2]
|
||||
}
|
||||
split(mac1, mac2, "(") #)
|
||||
macro = mac2[1]
|
||||
prefix = substr(line, 1, index(line, defundef) - 1)
|
||||
if (D_is_set[macro]) {
|
||||
# Preserve the white space surrounding the "#".
|
||||
print prefix "define", macro P[macro] D[macro]
|
||||
next
|
||||
} else {
|
||||
# Replace #undef with comments. This is necessary, for example,
|
||||
# in the case of _POSIX_SOURCE, which is predefined and required
|
||||
# on some systems where configure will not decide to define it.
|
||||
if (defundef == "undef") {
|
||||
print "/*", prefix defundef, macro, "*/"
|
||||
next
|
||||
}
|
||||
}
|
||||
}
|
||||
{ print }
|
||||
_ACAWK
|
||||
_ACEOF
|
||||
cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
as_fn_error $? "could not setup config headers machinery" "$LINENO" 5
|
||||
fi # test -n "$CONFIG_HEADERS"
|
||||
|
||||
|
||||
eval set X " :F $CONFIG_FILES :H $CONFIG_HEADERS "
|
||||
shift
|
||||
for ac_tag
|
||||
do
|
||||
@@ -3986,7 +4413,30 @@ which seems to be undefined. Please make sure it is defined" >&2;}
|
||||
esac \
|
||||
|| as_fn_error $? "could not create $ac_file" "$LINENO" 5
|
||||
;;
|
||||
|
||||
:H)
|
||||
#
|
||||
# CONFIG_HEADER
|
||||
#
|
||||
if test x"$ac_file" != x-; then
|
||||
{
|
||||
printf "%s\n" "/* $configure_input */" >&1 \
|
||||
&& eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs"
|
||||
} >"$ac_tmp/config.h" \
|
||||
|| as_fn_error $? "could not create $ac_file" "$LINENO" 5
|
||||
if diff "$ac_file" "$ac_tmp/config.h" >/dev/null 2>&1; then
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5
|
||||
printf "%s\n" "$as_me: $ac_file is unchanged" >&6;}
|
||||
else
|
||||
rm -f "$ac_file"
|
||||
mv "$ac_tmp/config.h" "$ac_file" \
|
||||
|| as_fn_error $? "could not create $ac_file" "$LINENO" 5
|
||||
fi
|
||||
else
|
||||
printf "%s\n" "/* $configure_input */" >&1 \
|
||||
&& eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" \
|
||||
|| as_fn_error $? "could not create -" "$LINENO" 5
|
||||
fi
|
||||
;;
|
||||
|
||||
|
||||
esac
|
||||
|
||||
@@ -2,7 +2,7 @@

AC_PREREQ(2.69)

AC_INIT([xgboost],[2.1.0],[],[xgboost],[])
AC_INIT([xgboost],[2.2.0],[],[xgboost],[])

: ${R_HOME=`R RHOME`}
if test -z "${R_HOME}"; then
@@ -28,11 +28,22 @@ AC_MSG_RESULT([])
AC_CHECK_LIB([execinfo], [backtrace], [BACKTRACE_LIB=-lexecinfo], [BACKTRACE_LIB=''])

### Endian detection
AC_MSG_CHECKING([endian])
AC_MSG_RESULT([])
AC_RUN_IFELSE([AC_LANG_PROGRAM([[#include <stdint.h>]], [[const uint16_t endianness = 256; return !!(*(const uint8_t *)&endianness);]])],
  [ENDIAN_FLAG="-DDMLC_CMAKE_LITTLE_ENDIAN=1"],
  [ENDIAN_FLAG="-DDMLC_CMAKE_LITTLE_ENDIAN=0"])
AC_ARG_VAR(USE_LITTLE_ENDIAN, "Whether to build with little endian (checks at compile time if unset)")
AS_IF([test -z "${USE_LITTLE_ENDIAN+x}"], [
  AC_MSG_NOTICE([Checking system endianness as USE_LITTLE_ENDIAN is unset])
  AC_MSG_CHECKING([system endianness])
  AC_C_BIGENDIAN(
    [AC_MSG_RESULT([using big endian])
    ENDIAN_FLAG="-DDMLC_CMAKE_LITTLE_ENDIAN=0"],
    [AC_MSG_RESULT([using little endian])
    ENDIAN_FLAG="-DDMLC_CMAKE_LITTLE_ENDIAN=1"],
    [AC_MSG_RESULT([unknown])
    AC_MSG_ERROR([Could not determine endianness. Please set USE_LITTLE_ENDIAN])]
  )
], [
  AC_MSG_NOTICE([Forcing endianness to: ${USE_LITTLE_ENDIAN}])
  ENDIAN_FLAG="-DDMLC_CMAKE_LITTLE_ENDIAN=${USE_LITTLE_ENDIAN}"
])

OPENMP_CXXFLAGS=""

@@ -73,4 +84,5 @@ AC_SUBST(OPENMP_LIB)
AC_SUBST(ENDIAN_FLAG)
AC_SUBST(BACKTRACE_LIB)
AC_CONFIG_FILES([src/Makevars])
AC_CONFIG_HEADERS([config.h])
AC_OUTPUT
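For instance, when installing the R package from a source tarball, the endianness
probe can be bypassed by exporting the variable before the build (a sketch, not from
the original sources; the tarball filename is hypothetical):

    # Force little-endian mode; configure then skips its own endianness check
    Sys.setenv(USE_LITTLE_ENDIAN = "1")
    install.packages("xgboost_2.2.0.tar.gz", repos = NULL, type = "source")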
@@ -1,14 +0,0 @@
basic_walkthrough           Basic feature walkthrough
custom_objective            Customize loss function, and evaluation metric
boost_from_prediction       Boosting from existing prediction
predict_first_ntree         Predicting using first n trees
generalized_linear_model    Generalized Linear Model
cross_validation            Cross validation
create_sparse_matrix        Create Sparse Matrix
predict_leaf_indices        Predicting the corresponding leaves
early_stopping              Early Stop in training
poisson_regression          Poisson regression on count data
tweedie_regression          Tweedie regression
gpu_accelerated             GPU-accelerated tree building algorithms
interaction_constraints     Interaction constraints among features
@@ -1,19 +0,0 @@
XGBoost R Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Customize loss function, and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Predicting using first n trees](predict_first_ntree.R)
* [Generalized Linear Model](generalized_linear_model.R)
* [Cross validation](cross_validation.R)
* [Create a sparse matrix from a dense one](create_sparse_matrix.R)
* [Use GPU-accelerated tree building algorithms](gpu_accelerated.R)

Benchmarks
====
* [Starter script for Kaggle Higgs Boson](../../demo/kaggle-higgs)

Notes
====
* Contributions of examples and benchmarks are more than welcome!
* If you would like to share how you use xgboost to solve your problem, send a pull request :)
@@ -1,114 +0,0 @@
require(xgboost)
require(methods)

# we load in the agaricus dataset
# In this example, we are aiming to predict whether a mushroom is edible
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
train <- agaricus.train
test <- agaricus.test
# the loaded data is stored in a sparseMatrix, and the label is a numeric vector in {0,1}
class(train$label)
class(train$data)

#-------------Basic Training using XGBoost-----------------
# this is the basic usage of xgboost: you can put a matrix in the data field
# note: we are putting in a sparse matrix here; xgboost handles sparse input natively
# use a sparse matrix when your features are sparse (e.g. when you are using one-hot encoded vectors)
print("Training xgboost with sparseMatrix")
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic")
# alternatively, you can put in a dense matrix, i.e. a basic R matrix
print("Training xgboost with Matrix")
bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic")

# you can also put in an xgb.DMatrix object, which stores the label, data and other metadata needed for advanced features
print("Training xgboost with xgb.DMatrix")
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2,
               objective = "binary:logistic")

# Verbose = 0,1,2
print("Train xgboost with verbose 0, no message")
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic", verbose = 0)
print("Train xgboost with verbose 1, print evaluation metric")
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic", verbose = 1)
print("Train xgboost with verbose 2, also print information about the trees")
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
               nthread = 2, objective = "binary:logistic", verbose = 2)

# you can also specify data as a file path to a LIBSVM-format input
# since we do not have this file with us, the following line is just for illustration
# bst <- xgboost(data = 'agaricus.train.svm', max_depth = 2, eta = 1, nrounds = 2, objective = "binary:logistic")

#--------------------basic prediction using xgboost--------------
# you can do prediction using the following line
# you can put in a Matrix, sparseMatrix, or xgb.DMatrix
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))

#-------------------save and load models-------------------------
# save the model to a binary local file
xgb.save(bst, "xgboost.model")
# load the binary model into R
# Function doesn't take 'nthreads', but it can be set like this:
RhpcBLASctl::omp_set_num_threads(1)
bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))

# save the model to R's raw vector
raw <- xgb.save.raw(bst)
# load the binary model into R
bst3 <- xgb.load.raw(raw)
pred3 <- predict(bst3, test$data)
# pred3 should be identical to pred
print(paste("sum(abs(pred3-pred))=", sum(abs(pred3 - pred))))
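# (not part of the original demo) R's own serializer works too and, unlike xgb.save,
# it also keeps R-level attributes attached to the booster object:
saveRDS(bst, "xgboost.rds")
bst4 <- readRDS("xgboost.rds")
pred4 <- predict(bst4, test$data)
print(paste("sum(abs(pred4-pred))=", sum(abs(pred4 - pred))))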
#----------------Advanced features --------------
# to use advanced features, we need to put the data in an xgb.DMatrix
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
dtest <- xgb.DMatrix(data = test$data, label = test$label)
#---------------Using an evaluation set----------------
# 'evals' is a list of xgb.DMatrix objects, each of them tagged with a name
evals <- list(train = dtrain, test = dtest)
# to train with an evaluation set, use xgb.train, which contains more advanced features
# the 'evals' argument allows us to monitor the evaluation result on all data in the list
print("Train xgboost using xgb.train with evaluation data")
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, evals = evals,
                 nthread = 2, objective = "binary:logistic")
# we can change evaluation metrics, or use multiple evaluation metrics
print("train xgboost using xgb.train with evaluation data, watch logloss and error")
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, evals = evals,
                 eval_metric = "error", eval_metric = "logloss",
                 nthread = 2, objective = "binary:logistic")

# an xgb.DMatrix can also be saved using xgb.DMatrix.save
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it in, simply call xgb.DMatrix
dtrain2 <- xgb.DMatrix("dtrain.buffer")
bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, evals = evals,
                 nthread = 2, objective = "binary:logistic")
# information can be extracted from an xgb.DMatrix using getinfo
label <- getinfo(dtest, "label")
pred <- predict(bst, dtest)
err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
print(paste("test-error=", err))

# You can dump the trees you learned into a text file using xgb.dump
dump_path <- file.path(tempdir(), 'dump.raw.txt')
xgb.dump(bst, dump_path, with_stats = TRUE)

# Finally, you can check which features are the most important.
print("Most important features (look at column Gain):")
imp_matrix <- xgb.importance(feature_names = colnames(train$data), model = bst)
print(imp_matrix)

# Feature importance bar plot by gain
print("Feature importance plot:")
print(xgb.plot.importance(importance_matrix = imp_matrix))
@@ -1,26 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

evals <- list(eval = dtest, train = dtrain)
###
# advanced: start from an initial base prediction
#
print('start running example to start from an initial prediction')
# train xgboost for 1 round
param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic')
bst <- xgb.train(param, dtrain, 1, evals)
# Note: we need the margin value instead of the transformed prediction in set_base_margin
# predicting with outputmargin=TRUE will always give you margin values before the logistic transformation
ptrain <- predict(bst, dtrain, outputmargin = TRUE)
ptest <- predict(bst, dtest, outputmargin = TRUE)
# set the base_margin property of dtrain and dtest
# the base margin is the base prediction we will boost from
setinfo(dtrain, "base_margin", ptrain)
setinfo(dtest, "base_margin", ptest)

print('this is the result of boosting from the initial prediction')
bst <- xgb.train(params = param, data = dtrain, nrounds = 1, evals = evals)
@@ -1,117 +0,0 @@
require(xgboost)
require(Matrix)
require(data.table)
if (!require(vcd)) {
  install.packages('vcd') # Available on CRAN. Used for its dataset with categorical values.
  require(vcd)
}
# According to its documentation, XGBoost works only on numbers.
# Sometimes the dataset we have to work on contains categorical data.
# A categorical variable is one that has a fixed number of possible values.
# For example, if for each observation a variable called "Colour" can have only
# "red", "blue" or "green" as its value, it is a categorical variable.
#
# In R, a categorical variable is called a factor.
# Type ?factor in the console for more information.
#
# In this demo we will see how to transform a dense data frame with categorical variables into a sparse matrix
# before analyzing it in XGBoost.
# The method we are going to see is usually called "one-hot encoding".

# Load the Arthritis dataset into memory.
data(Arthritis)

# create a copy of the dataset with the data.table package
# (data.table is 100% compatible with R data frames, but its syntax is a lot more consistent
# and its performance is really good).
df <- data.table(Arthritis, keep.rownames = FALSE)

# Let's have a look at the data.table
cat("Print the dataset\n")
print(df)

# 2 columns have factor type, one has ordinal type
# (an ordinal variable is a categorical variable whose values can be ordered, here: None > Some > Marked).
cat("Structure of the dataset\n")
str(df)

# Let's add some new categorical features to see if they help.
# Of course these features are highly correlated with the Age feature.
# Usually that's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features,
# even in the case of highly correlated features.

# For the first feature we create groups of age by rounding the real age.
# Note that we transform it to a factor (categorical data) so the algorithm treats the groups as independent values.
df[, AgeDiscret := as.factor(round(Age / 10, 0))]

# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old.
# I chose this value based on nothing.
# We will see later if simplifying the information based on arbitrary values is a good strategy
# (I am sure you already have an idea of how well it will work!).
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]

# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[, ID := NULL]

# List the different values for the column Treatment: Placebo, Treated.
cat("Values of the categorical feature Treatment\n")
print(levels(df[, Treatment]))

# As the next step, we will transform the categorical data to dummy variables.
# This method is also called one-hot encoding.
# The purpose is to transform each value of each categorical feature into one binary feature.
#
# For example, the column Treatment will be replaced by two columns, Placebo and Treated.
# Each of them will be binary.
# An observation which had the value Placebo in the column Treatment before the transformation will have, after the transformation,
# the value 1 in the new column Placebo and the value 0 in the new column Treated.
#
# The formula Improved ~ . - 1 used below means: transform all categorical features except the column Improved to binary values.
# The column Improved is excluded because it will be our output column, the one we want to predict.
sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)

cat("Encoding of the sparse Matrix\n")
print(sparse_matrix)

# Create the output vector (not sparse)
# 1. set, for all rows, the field in the Y column to 0;
# 2. set Y to 1 when Improved == Marked;
# 3. return the Y column.
output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y]

# What follows is the same process as in the other demos
cat("Learning...\n")
bst <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = output_vector), max_depth = 9,
                 eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")

importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
print(importance)
# According to the importance matrix printed above, the most important feature in this dataset for predicting whether the treatment will work is Age.
# The second most important feature is having received a placebo or not.
# Sex is third.
# Then come our generated features (AgeDiscret); we can see that their contribution is very low (Gain column).

# Do these results make sense?
# Let's run a chi-squared test between each of these features and the outcome.

print(chisq.test(df$Age, df$Y))
# The chi-squared statistic between Age and the illness disappearing is about 35.

print(chisq.test(df$AgeDiscret, df$Y))
# Our first simplification of Age gives a chi-squared statistic of about 8.

print(chisq.test(df$AgeCat, df$Y))
# The arbitrary split I did between young and old at 30 years old has a low chi-squared statistic of about 2.
# It's a result we might expect: maybe in my mind being over 30 is what counts as old (I am 32 and starting to feel old, which may explain that),
# but for the illness we are studying, the vulnerable age is not the same.
# Don't let your "gut" lower the quality of your model. In "data science", there is science :-)

# As you can see, in general destroying information by simplifying it won't improve your model.
# The chi-squared tests just demonstrated that.
# But in more complex cases, creating a new feature based on an existing one which makes the link with the outcome
# more obvious may help the algorithm and improve the model.
# The case studied here is not complex enough to show that. Check the Kaggle forums for some challenging datasets.
# However, it's almost always worse when you add arbitrary rules.
# Moreover, you can notice that even though we added some not-so-useful new features highly correlated with
# other features, the boosted tree algorithm was still able to choose the best one, which in this case is Age.
# A linear model may not be that robust in this scenario.
@@ -1,51 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

nrounds <- 2
param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic')

cat('running cross validation\n')
# do cross validation; this will print the result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is the standard deviation of the metric
xgb.cv(param, dtrain, nrounds, nfold = 5, metrics = 'error')

cat('running cross validation, disable standard deviation display\n')
# do cross validation; this will print the result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is the standard deviation of the metric
xgb.cv(param, dtrain, nrounds, nfold = 5,
       metrics = 'error', showsd = FALSE)

###
# you can also do cross validation with a customized loss function
# See custom_objective.R
##
print('running cross validation, with customized loss function')

logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(metric = "error", value = err))
}

param <- list(max_depth = 2, eta = 1,
              objective = logregobj, eval_metric = evalerror)
# train with customized objective
xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5)

# do cross validation with prediction values for each fold
res <- xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5, prediction = TRUE)
res$evaluation_log
length(res$pred)
@ -1,65 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

# note: for a customized objective function, we leave 'objective' as default
# note: what we get in prediction is the margin value
# you must know what you are doing
evals <- list(eval = dtest, train = dtrain)
num_round <- 2

# user-defined objective function: given predictions, return the gradient and second-order gradient
# this is log-likelihood (logistic) loss
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
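# Added note (not part of the original demo): with margin z and
# p = 1 / (1 + exp(-z)), the logistic loss is
#   l(y, z) = -[y * log(p) + (1 - y) * log(1 - p)],
# whose derivatives w.r.t. z are dl/dz = p - y and d2l/dz2 = p * (1 - p),
# exactly the 'grad' and 'hess' returned above.
# A quick numerical check of the gradient at an arbitrary point:
z <- 0.7; y <- 1; eps <- 1e-6
loss <- function(z) -(y * log(plogis(z)) + (1 - y) * log(1 - plogis(z)))
(loss(z + eps) - loss(z - eps)) / (2 * eps)  # finite difference
plogis(z) - y                                # closed form: p - y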

# user-defined evaluation function: return a pair (metric_name, result)
# NOTE: when using a customized loss function, the default prediction value is the margin,
# which may make built-in evaluation metrics misbehave.
# For example, we are doing logistic loss here, so the prediction is the score before the
# logistic transformation, while the built-in evaluation error assumes the input is after
# the logistic transformation.
# Keep this in mind when you use the customization; you may need to write a customized evaluation function.
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(metric = "error", value = err))
}
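# Added note (not part of the original demo): because predictions here are raw
# margins, evalerror() thresholds them at 0, which corresponds to probability
# 0.5 after the logistic transformation. If probabilities are needed from a
# booster trained this way, apply the transformation manually, e.g.:
#   probs <- plogis(predict(bst, dtest))   # assumes the 'bst' trained below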

param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0,
              objective = logregobj, eval_metric = evalerror)
print('start training with user customized objective')
# training with customized objective; we can also do step-by-step training
# simply look at xgb.train's implementation for details
bst <- xgb.train(param, dtrain, num_round, evals)

#
# there can be cases where you want additional information
# besides the DMatrix properties you can get via getinfo;
# you can set additional information as attributes of the DMatrix

# set the 'label' attribute of dtrain to its label; we use label as an example, it can be anything
attr(dtrain, 'label') <- getinfo(dtrain, 'label')
# this is a new customized objective, where you can access what you set;
# the same applies to customized evaluation functions
logregobjattr <- function(preds, dtrain) {
  # now you can access the attribute in the customized function
  labels <- attr(dtrain, 'label')
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0,
              objective = logregobjattr, eval_metric = evalerror)
print('start training with user customized objective, with additional attributes in DMatrix')
# training with customized objective; we can also do step-by-step training
# simply look at xgb.train's implementation for details
bst <- xgb.train(param, dtrain, num_round, evals)
@ -1,40 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for a customized objective function, we leave 'objective' as default
# note: what we get in prediction is the margin value
# you must know what you are doing
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0)
evals <- list(eval = dtest)
num_round <- 20
# user-defined objective function: given predictions, return the gradient and second-order gradient
# this is log-likelihood (logistic) loss
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
# user-defined evaluation function: return a pair (metric_name, result)
# NOTE: when using a customized loss function, the default prediction value is the margin,
# which may make built-in evaluation metrics misbehave.
# For example, we are doing logistic loss here, so the prediction is the score before the
# logistic transformation, while the built-in evaluation error assumes the input is after
# the logistic transformation.
# Keep this in mind when you use the customization; you may need to write a customized evaluation function.
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(metric = "error", value = err))
}
print('start training with early stopping setting')

bst <- xgb.train(param, dtrain, num_round, evals,
                 objective = logregobj, eval_metric = evalerror, maximize = FALSE,
                 early_stopping_rounds = 3)
bst <- xgb.cv(param, dtrain, num_round, nfold = 5,
              objective = logregobj, eval_metric = evalerror,
              maximize = FALSE, early_stopping_rounds = 3)
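# Added note (not part of the original demo; the attribute name is an
# assumption that may vary across package versions): for a booster fitted with
# early stopping, as in the xgb.train() call above, the chosen round is
# typically stored as a booster attribute, e.g.
#   xgb.attributes(bst)$best_iteration
# and predict() with iterationrange = NULL stops at that best iteration.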
@ -1,33 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we are using a linear model as the booster, instead of trees;
# you can fit a linear regression or a logistic regression model
##

# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias, which is the L2 regularizer on the bias term
param <- list(objective = "binary:logistic", booster = "gblinear",
              nthread = 2, alpha = 0.0001, lambda = 1)

# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun);
# parallelization can affect convergence in certain cases,
# and setting eta to a smaller value, e.g. 0.5, can make the optimization more stable

##
# the rest of the settings are the same
##
evals <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, evals)
ypred <- predict(bst, dtest)
labels <- getinfo(dtest, 'label')
cat('error of preds=', mean(as.numeric(ypred > 0.5) != labels), '\n')
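# Added note (not part of the original demo): since booster = "gblinear", the
# fitted coefficients can be extracted directly (see coef.xgb.Booster):
coefs <- coef(bst)
head(coefs)  # intercept first, followed by one weight per feature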
@ -1,45 +0,0 @@
# An example of using GPU-accelerated tree building algorithms
#
# NOTE: it can only run if you have a CUDA-enabled GPU and the package was
# specially compiled with GPU support.
#
# For the current functionality, see
# https://xgboost.readthedocs.io/en/latest/gpu/index.html
#

library('xgboost')

# Simulate an N x p random matrix with a binomial response that depends on pp columns
set.seed(111)
N <- 1000000
p <- 50
pp <- 25
X <- matrix(runif(N * p), ncol = p)
betas <- 2 * runif(pp) - 1
sel <- sort(sample(p, pp))
m <- X[, sel] %*% betas - 1 + rnorm(N)
y <- rbinom(N, 1, plogis(m))

tr <- sample.int(N, N * 0.75)
dtrain <- xgb.DMatrix(X[tr, ], label = y[tr])
dtest <- xgb.DMatrix(X[-tr, ], label = y[-tr])
evals <- list(train = dtrain, test = dtest)

# An example of running the 'gpu_hist' algorithm,
# which is
# - similar to 'hist'
# - the fastest option for moderately large datasets
# - current limitations: max_depth < 16, does not implement guided loss
# (Older releases also offered tree_method = 'gpu_exact', a slower, more
#  memory-hungry GPU variant that does not use binning.)
param <- list(objective = 'reg:logistic', eval_metric = 'auc', subsample = 0.5, nthread = 4,
              max_bin = 64, tree_method = 'gpu_hist')
pt <- proc.time()
bst_gpu <- xgb.train(param, dtrain, evals = evals, nrounds = 50)
proc.time() - pt

# Compare to the 'hist' algorithm:
param$tree_method <- 'hist'
pt <- proc.time()
bst_hist <- xgb.train(param, dtrain, evals = evals, nrounds = 50)
proc.time() - pt
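# Added note (not part of the original demo): in XGBoost >= 2.0 the preferred
# way to request GPU training is the 'device' parameter combined with the
# regular 'hist' tree method, e.g.:
#   param <- list(objective = 'reg:logistic', eval_metric = 'auc',
#                 tree_method = 'hist', device = 'cuda')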
@ -1,113 +0,0 @@
library(xgboost)
library(data.table)

set.seed(1024)

# Function to obtain a list of interactions fitted in trees; requires the maximum depth as input
treeInteractions <- function(input_tree, input_max_depth) {
  ID_merge <- i.id <- i.feature <- NULL  # Suppress warning "no visible binding for global variable"

  trees <- data.table::copy(input_tree)  # copy tree input to prevent overwriting
  if (input_max_depth < 2) return(list())  # no interactions if max depth < 2
  if (nrow(input_tree) == 1) return(list())

  # Attach parent nodes
  for (i in 2:input_max_depth) {
    if (i == 2) trees[, ID_merge := ID] else trees[, ID_merge := get(paste0('parent_', i - 2))]
    parents_left <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = Yes)]
    parents_right <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = No)]

    data.table::setorderv(trees, 'ID_merge')
    data.table::setorderv(parents_left, 'ID_merge')
    data.table::setorderv(parents_right, 'ID_merge')

    trees <- merge(trees, parents_left, by = 'ID_merge', all.x = TRUE)
    trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1))
          := list(i.id, i.feature)]
    trees[, c('i.id', 'i.feature') := NULL]

    trees <- merge(trees, parents_right, by = 'ID_merge', all.x = TRUE)
    trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1))
          := list(i.id, i.feature)]
    trees[, c('i.id', 'i.feature') := NULL]
  }

  # Extract nodes with interactions
  interaction_trees <- trees[!is.na(Split) & !is.na(parent_1),  # nolint: object_usage_linter
                             c('Feature', paste0('parent_feat_', 1:(input_max_depth - 1))),
                             with = FALSE]
  interaction_trees_split <- split(interaction_trees, seq_len(nrow(interaction_trees)))
  interaction_list <- lapply(interaction_trees_split, as.character)

  # Remove NAs (no parent interaction)
  interaction_list <- lapply(interaction_list, function(x) x[!is.na(x)])

  # Remove non-interactions (same variable)
  interaction_list <- lapply(interaction_list, unique)  # remove repeated variables
  interaction_length <- lengths(interaction_list)
  interaction_list <- interaction_list[interaction_length > 1]
  interaction_list <- unique(lapply(interaction_list, sort))
  return(interaction_list)
}

# Generate sample data
x <- list()
for (i in 1:10) {
  x[[i]] <- i * rnorm(1000, 10)
}
x <- as.data.table(x)

y <- -1 * x[, rowSums(.SD)] + x[['V1']] * x[['V2']] + x[['V3']] * x[['V4']] * x[['V5']] +
  rnorm(1000, 0.001) + 3 * sin(x[['V7']])

train <- as.matrix(x)

# Interaction constraint list (column names form)
interaction_list <- list(c('V1', 'V2'), c('V3', 'V4', 'V5'))

# Convert interaction constraint list into feature index form
cols2ids <- function(object, col_names) {
  LUT <- seq_along(col_names) - 1
  names(LUT) <- col_names
  rapply(object, function(x) LUT[x], classes = "character", how = "replace")
}
interaction_list_fid <- cols2ids(interaction_list, colnames(train))
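# Added illustration (not part of the original demo): feature names are mapped
# to base-0 column indices, so for this 10-column matrix
#   list(c('V1', 'V2'), c('V3', 'V4', 'V5'))
# becomes
#   list(c(0, 1), c(2, 3, 4))
# which is the form expected by the 'interaction_constraints' parameter.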

# Fit model with interaction constraints
bst <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
                 eta = 0.1, nthread = 2, nrounds = 1000,
                 interaction_constraints = interaction_list_fid)

bst_tree <- xgb.model.dt.tree(colnames(train), bst)
bst_interactions <- treeInteractions(bst_tree, 4)
# interactions are constrained to combinations of V1*V2 and V3*V4*V5

# Fit model without interaction constraints
bst2 <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
                  eta = 0.1, nthread = 2, nrounds = 1000)

bst2_tree <- xgb.model.dt.tree(colnames(train), bst2)
bst2_interactions <- treeInteractions(bst2_tree, 4)  # many more interactions

# Fit model with both interaction and monotonicity constraints
bst3 <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
                  eta = 0.1, nthread = 2, nrounds = 1000,
                  interaction_constraints = interaction_list_fid,
                  monotone_constraints = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0))

bst3_tree <- xgb.model.dt.tree(colnames(train), bst3)
bst3_interactions <- treeInteractions(bst3_tree, 4)
# interactions are still constrained to combinations of V1*V2 and V3*V4*V5

# Show that the monotonic constraint still applies by checking scores after incrementing V1
x1 <- sort(unique(x[['V1']]))
for (i in seq_along(x1)) {
  testdata <- copy(x[, -('V1')])
  testdata[['V1']] <- x1[i]
  testdata <- testdata[, paste0('V', 1:10), with = FALSE]
  pred <- predict(bst3, as.matrix(testdata))

  # Should not print out anything due to monotonic constraints
  if (i > 1) if (any(pred > prev_pred)) print(i)
  prev_pred <- pred
}
@ -1,6 +0,0 @@
require(xgboost)  # added: needed for xgb.train() / xgb.DMatrix() below
data(mtcars)
head(mtcars)
bst <- xgb.train(data = xgb.DMatrix(as.matrix(mtcars[, -11]), label = mtcars[, 11]),
                 objective = 'count:poisson', nrounds = 5)
pred <- predict(bst, as.matrix(mtcars[, -11]))
sqrt(mean((pred - mtcars[, 11]) ^ 2))
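# Added note (not part of the original demo): with objective = 'count:poisson',
# predict() returns the expected count (the Poisson mean, i.e. the exponential
# of the raw margin), so it is directly comparable to the observed counts in
# the RMSE computed above.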
@ -1,23 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic')
evals <- list(eval = dtest, train = dtrain)
nrounds <- 2

# train the model for two rounds
bst <- xgb.train(param, dtrain, nrounds, nthread = 2, evals = evals)
cat('start testing prediction from first n trees\n')
labels <- getinfo(dtest, 'label')

### predict using only the first tree
ypred1 <- predict(bst, dtest, iterationrange = c(1, 1))
# by default, we predict using all the trees
ypred2 <- predict(bst, dtest)

cat('error of ypred1=', mean(as.numeric(ypred1 > 0.5) != labels), '\n')
cat('error of ypred2=', mean(as.numeric(ypred2 > 0.5) != labels), '\n')
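# Added note (not part of the original demo): iterationrange uses base-1,
# inclusive bounds (same format as R's seq), so c(1, 1) means "only the first
# iteration" and c(1, 20) would mean the first twenty; passing NULL uses all
# rounds, or stops at the best iteration when the model used early stopping.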
@ -1,54 +0,0 @@
require(xgboost)
require(data.table)
require(Matrix)

set.seed(1982)

# load in the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)

param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic')
nrounds <- 4

# train the model for four rounds
bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)

# Model accuracy without new features
accuracy.before <- (sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label)
                    / length(agaricus.test$label))

# get the index of the leaf each observation falls into, one column per tree
pred_with_leaf <- predict(bst, dtest, predleaf = TRUE)
head(pred_with_leaf)

create.new.tree.features <- function(model, original.features) {
  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
  cols <- list()
  for (i in 1:xgb.get.num.boosted.rounds(model)) {
    # max is not the real max, but that is not important for the purpose of adding features
    leaf.id <- sort(unique(pred_with_leaf[, i]))
    cols[[i]] <- factor(x = pred_with_leaf[, i], levels = leaf.id)
  }
  cbind(original.features, sparse.model.matrix(~ . - 1, as.data.frame(cols)))
}
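# Added note (not part of the original demo): this is the classic
# "trees as feature transformers" trick - each observation is re-encoded as
# one-hot indicators of the leaf it lands in for every tree, and those
# indicators are appended to the raw features for a second model.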

# Convert previous features to one-hot encoding
new.features.train <- create.new.tree.features(bst, agaricus.train$data)
new.features.test <- create.new.tree.features(bst, agaricus.test$data)
colnames(new.features.test) <- colnames(new.features.train)

# learning with the new features
new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)

# Model accuracy with new features
accuracy.after <- (sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label)
                   / length(agaricus.test$label))

# Here the accuracy was already good and is now perfect.
cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now",
          accuracy.after, "!\n"))
@ -1,13 +0,0 @@
# runs all scripts in the demo folder; removed during packaging.
demo(basic_walkthrough, package = 'xgboost')
demo(custom_objective, package = 'xgboost')
demo(boost_from_prediction, package = 'xgboost')
demo(predict_first_ntree, package = 'xgboost')
demo(generalized_linear_model, package = 'xgboost')
demo(cross_validation, package = 'xgboost')
demo(create_sparse_matrix, package = 'xgboost')
demo(predict_leaf_indices, package = 'xgboost')
demo(early_stopping, package = 'xgboost')
demo(poisson_regression, package = 'xgboost')
demo(tweedie_regression, package = 'xgboost')
#demo(gpu_accelerated, package = 'xgboost') # can only run when built with GPU support
@ -1,49 +0,0 @@
library(xgboost)
library(data.table)
library(cplm)
library(Matrix)  # added: provides sparse.model.matrix()

data(AutoClaim)

# auto insurance dataset analyzed by Yip and Yau (2005)
dt <- data.table(AutoClaim)

# exclude these columns from the model matrix
exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')

# retain the missing values
# NOTE: this dataset comes ready out of the box
options(na.action = 'na.pass')
x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE])
options(na.action = 'na.omit')

# response
y <- dt[, CLM_AMT5]

d_train <- xgb.DMatrix(data = x, label = y, missing = NA)

# the tweedie_variance_power parameter determines the shape of the
# distribution
# - closer to 1 is more Poisson-like, and the mass
#   is more concentrated near zero
# - closer to 2 is more gamma-like, and the mass spreads to
#   the right with less concentration near zero
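# Added note (not part of the original demo): formally, a Tweedie distribution
# has variance Var(Y) = phi * mu^p with p = tweedie_variance_power; for
# 1 < p < 2 it is the compound Poisson-gamma family, which has a point mass at
# zero plus a continuous positive part - a good fit for claim amounts.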
params <- list(
  objective = 'reg:tweedie',
  eval_metric = 'rmse',
  tweedie_variance_power = 1.4,
  max_depth = 6,
  eta = 1)

bst <- xgb.train(
  data = d_train,
  params = params,
  maximize = FALSE,
  evals = list(train = d_train),
  nrounds = 20)

var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst)

preds <- predict(bst, d_train)

rmse <- sqrt(mean((y - preds) ^ 2))
@ -5,66 +5,88 @@
\title{Model Serialization and Compatibility}
\description{
When it comes to serializing XGBoost models, it's possible to use R serializers such as
\link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides
\code{\link[=save]{save()}} or \code{\link[=saveRDS]{saveRDS()}} to serialize an XGBoost R model, but XGBoost also provides
its own serializers with better compatibility guarantees, which allow loading
said models in other language bindings of XGBoost.

Note that an \code{xgb.Booster} object, outside of its core components, might also keep:\itemize{
\item Additional model configuration (accessible through \link{xgb.config}),
which includes model fitting parameters like \code{max_depth} and runtime parameters like \code{nthread}.
Note that an \code{xgb.Booster} object (\strong{as produced by \code{\link[=xgb.train]{xgb.train()}}}, see rest of the doc
for objects produced by \code{\link[=xgboost]{xgboost()}}), outside of its core components, might also keep:
\itemize{
\item Additional model configuration (accessible through \code{\link[=xgb.config]{xgb.config()}}), which includes
model fitting parameters like \code{max_depth} and runtime parameters like \code{nthread}.
These are not necessarily useful for prediction/importance/plotting.
\item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs,
which are kept as a \code{data.table} object, accessible through \code{attributes(model)$evaluation_log}
if present.
\item Additional R specific attributes - e.g. results of callbacks, such as evaluation logs,
which are kept as a \code{data.table} object, accessible through
\code{attributes(model)$evaluation_log} if present.
}

The first one (configurations) does not have the same compatibility guarantees as
the model itself, including attributes that are set and accessed through \link{xgb.attributes} - that is, such configuration
might be lost after loading the booster in a different XGBoost version, regardless of the
serializer that was used. These are saved when using \link{saveRDS}, but will be discarded
if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's
serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}.
the model itself, including attributes that are set and accessed through
\code{\link[=xgb.attributes]{xgb.attributes()}} - that is, such configuration might be lost after loading the
booster in a different XGBoost version, regardless of the serializer that was used.
These are saved when using \code{\link[=saveRDS]{saveRDS()}}, but will be discarded if loaded into an
incompatible XGBoost version. They are not saved when using XGBoost's
serializers from its public interface including \code{\link[=xgb.save]{xgb.save()}} and \code{\link[=xgb.save.raw]{xgb.save.raw()}}.

The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are
not saved when using XGBoost's own serializers. These attributes are only used for informational
purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R
call that produced the model, but are otherwise not used for prediction / importance / plotting / etc.
The second ones (R attributes) are not part of the standard XGBoost model structure,
and thus are not saved when using XGBoost's own serializers. These attributes are
only used for informational purposes, such as keeping track of evaluation metrics as
the model was fit, or saving the R call that produced the model, but are otherwise
not used for prediction / importance / plotting / etc.
These R attributes are only preserved when using R's serializers.

Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and XGBoost models
before version \verb{2.1.0}; have a very different R object structure and are incompatible with
each other. Hence, models that were saved with R serializers like \code{saveRDS} or \code{save} before
version \verb{2.1.0} will not work with later \code{xgboost} versions and vice versa. Be aware that
the structure of R model objects could in theory change again in the future, so XGBoost's serializers
In addition to the regular \code{xgb.Booster} objects produced by \code{\link[=xgb.train]{xgb.train()}}, the
function \code{\link[=xgboost]{xgboost()}} produces a different subclass \code{xgboost}, which keeps other
additional metadata as R attributes such as class names in classification problems,
and which has a dedicated \code{predict} method that uses different defaults. XGBoost's
own serializers can work with this \code{xgboost} class, but as they do not keep R
attributes, the resulting object, when deserialized, is downcasted to the regular
\code{xgb.Booster} class (i.e. it loses the metadata, and the resulting object will use
\code{predict.xgb.Booster} instead of \code{predict.xgboost}) - for these \code{xgboost} objects,
\code{saveRDS} might thus be a better option if the extra functionalities are needed.

Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and
XGBoost models before version \verb{2.1.0}; have a very different R object structure and
are incompatible with each other. Hence, models that were saved with R serializers
like \code{\link[=saveRDS]{saveRDS()}} or \code{\link[=save]{save()}} before version \verb{2.1.0} will not work with later
\code{xgboost} versions and vice versa. Be aware that the structure of R model objects
could in theory change again in the future, so XGBoost's serializers
should be preferred for long-term storage.

Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or
higher of said package, and will have the same compatibility restrictions as R serializers.
Furthermore, note that using the package \code{qs} for serialization will require
version 0.26 or higher of said package, and will have the same compatibility
restrictions as R serializers.
}
\details{
Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
Use \code{\link[=xgb.save]{xgb.save()}} to save the XGBoost model as a stand-alone file. You may opt into
the JSON format by specifying the JSON extension. To read the model back, use
\code{\link{xgb.load}}.
\code{\link[=xgb.load]{xgb.load()}}.

Use \code{\link{xgb.save.raw}} to save the XGBoost model as a sequence (vector) of raw bytes
Use \code{\link[=xgb.save.raw]{xgb.save.raw()}} to save the XGBoost model as a sequence (vector) of raw bytes
in a future-proof manner. Future releases of XGBoost will be able to read the raw bytes and
re-construct the corresponding model. To read the model back, use \code{\link{xgb.load.raw}}.
The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
re-construct the corresponding model. To read the model back, use \code{\link[=xgb.load.raw]{xgb.load.raw()}}.
The \code{\link[=xgb.save.raw]{xgb.save.raw()}} function is useful if you would like to persist the XGBoost model
as part of another R object.

Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such
as evaluation logs, but note that future compatibility of such objects is outside XGBoost's
control as it relies on R's serialization format (see e.g. the details section in
\link{serialize} and \link{save} from base R).
Use \code{\link[=saveRDS]{saveRDS()}} if you require the R-specific attributes that a booster might have, such
as evaluation logs or the model class \code{xgboost} instead of \code{xgb.Booster}, but note that
future compatibility of such objects is outside XGBoost's control as it relies on R's
serialization format (see e.g. the details section in \link{serialize} and \code{\link[=save]{save()}} from base R).

For more details and explanation about model persistence and archival, consult the page
\url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgb.train(data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
                 max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
                 objective = "binary:logistic")
data(agaricus.train, package = "xgboost")

bst <- xgb.train(
  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  max_depth = 2,
  eta = 1,
  nthread = 2,
  nrounds = 2,
  objective = "binary:logistic"
)

# Save as a stand-alone file; load it with xgb.load()
fname <- file.path(tempdir(), "xgb_model.ubj")
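# Added sketch (not part of the diff): the typical round trip with XGBoost's
# own serializer, continuing from the 'fname' defined in the example above:
xgb.save(bst, fname)
bst2 <- xgb.load(fname)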
@ -16,11 +16,10 @@ This data set is originally from the Mushroom data set,
UCI Machine Learning Repository.
}
\details{
This data set includes the following fields:

It includes the following fields:
\itemize{
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
\item \code{label}: The label for each record.
\item \code{data}: A sparse Matrix of 'dgCMatrix' class with 126 columns.
}
}
\references{
@ -16,11 +16,10 @@ This data set is originally from the Mushroom data set,
UCI Machine Learning Repository.
}
\details{
This data set includes the following fields:

It includes the following fields:
\itemize{
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
\item \code{label}: The label for each record.
\item \code{data}: A sparse Matrix of 'dgCMatrix' class with 126 columns.
}
}
\references{
@ -12,11 +12,12 @@
\item{...}{Not used.}
}
\value{
The extracted coefficients:\itemize{
\item If there's only one coefficient per column in the data, will be returned as a
The extracted coefficients:
\itemize{
\item If there is only one coefficient per column in the data, will be returned as a
vector, potentially containing the feature names if available, with the intercept
as the first element.
\item If there's more than one coefficient per column in the data (e.g. when using
\item If there is more than one coefficient per column in the data (e.g. when using
\code{objective="multi:softmax"}), will be returned as a matrix with dimensions equal
to \verb{[num_features, num_cols]}, with the intercepts as first row. Note that the column
(classes in multi-class classification) dimension will not be named.
@ -33,16 +34,19 @@ coefficients as used by \link{predict.xgb.Booster}.
}
\description{
Extracts the coefficients from a 'gblinear' booster object,
as produced by \code{xgb.train} when using parameter \code{booster="gblinear"}.
as produced by \code{\link[=xgb.train]{xgb.train()}} when using parameter \code{booster="gblinear"}.

Note: this function will error out if passing a booster model
which is not of "gblinear" type.
}
\examples{
library(xgboost)

data(mtcars)

y <- mtcars[, 1]
x <- as.matrix(mtcars[, -1])

dm <- xgb.DMatrix(data = x, label = y, nthread = 1)
params <- list(booster = "gblinear", nthread = 1)
model <- xgb.train(data = dm, params = params, nrounds = 2)
@ -13,13 +13,14 @@
Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}.
}
\details{
Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
Note: since \code{\link[=nrow]{nrow()}} and \code{\link[=ncol]{ncol()}} internally use \code{\link[=dim]{dim()}}, they can also
be directly used with an \code{xgb.DMatrix} object.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")

train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label, nthread = 2)
dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)

stopifnot(nrow(dtrain) == nrow(train$data))
stopifnot(ncol(dtrain) == ncol(train$data))
@ -10,26 +10,27 @@
\method{dimnames}{xgb.DMatrix}(x) <- value
}
\arguments{
\item{x}{object of class \code{xgb.DMatrix}}
\item{x}{Object of class \code{xgb.DMatrix}.}

\item{value}{a list of two elements: the first one is ignored
\item{value}{A list of two elements: the first one is ignored
and the second one is column names}
}
\description{
Only column names are supported for \code{xgb.DMatrix}, thus setting of
row names would have no effect and returned row names would be NULL.
row names would have no effect and returned row names would be \code{NULL}.
}
\details{
Generic \code{dimnames} methods are used by \code{colnames}.
Since row names are irrelevant, it is recommended to use \code{colnames} directly.
Generic \code{\link[=dimnames]{dimnames()}} methods are used by \code{\link[=colnames]{colnames()}}.
Since row names are irrelevant, it is recommended to use \code{\link[=colnames]{colnames()}} directly.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")

train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label, nthread = 2)
dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
dimnames(dtrain)
colnames(dtrain)
colnames(dtrain) <- make.names(1:ncol(train$data))
print(dtrain, verbose=TRUE)
print(dtrain, verbose = TRUE)

}
@ -22,34 +22,34 @@ setinfo(object, name, info)
\method{setinfo}{xgb.DMatrix}(object, name, info)
}
\arguments{
\item{object}{Object of class \code{xgb.DMatrix} of \code{xgb.Booster}.}
\item{object}{Object of class \code{xgb.DMatrix} or \code{xgb.Booster}.}

\item{name}{the name of the information field to get (see details)}
\item{name}{The name of the information field to get (see details).}

\item{info}{the specific field of information to set}
\item{info}{The specific field of information to set.}
}
\value{
For \code{getinfo}, will return the requested field. For \code{setinfo}, will always return value \code{TRUE}
if it succeeds.
For \code{getinfo()}, will return the requested field. For \code{setinfo()},
will always return value \code{TRUE} if it succeeds.
}
\description{
Get or set information of xgb.DMatrix and xgb.Booster objects
}
\details{
The \code{name} field can be one of the following for \code{xgb.DMatrix}:

\itemize{
\item \code{label}
\item \code{weight}
\item \code{base_margin}
\item \code{label_lower_bound}
\item \code{label_upper_bound}
\item \code{group}
\item \code{feature_type}
\item \code{feature_name}
\item \code{nrow}
\item label
\item weight
\item base_margin
\item label_lower_bound
\item label_upper_bound
\item group
\item feature_type
\item feature_name
\item nrow
}
See the documentation for \link{xgb.DMatrix} for more information about these fields.

See the documentation for \code{\link[=xgb.DMatrix]{xgb.DMatrix()}} for more information about these fields.

For \code{xgb.Booster}, can be one of the following:
\itemize{
@ -57,17 +57,18 @@ For \code{xgb.Booster}, can be one of the following:
\item \code{feature_name}
}

Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group'
Note that, while 'qid' cannot be retrieved, it is possible to get the equivalent 'group'
for a DMatrix that had 'qid' assigned.

\bold{Important}: when calling \code{setinfo}, the objects are modified in-place. See
\link{xgb.copy.Booster} for an idea of how this in-place assignment works.
\strong{Important}: when calling \code{\link[=setinfo]{setinfo()}}, the objects are modified in-place. See
\code{\link[=xgb.copy.Booster]{xgb.copy.Booster()}} for an idea of how this in-place assignment works.

See the documentation for \link{xgb.DMatrix} for possible fields that can be set
See the documentation for \code{\link[=xgb.DMatrix]{xgb.DMatrix()}} for possible fields that can be set
(which correspond to arguments in that function).

Note that the following fields are allowed in the construction of an \code{xgb.DMatrix}
but \bold{aren't} allowed here:\itemize{
but \strong{are not} allowed here:
\itemize{
\item data
\item missing
\item silent
@ -75,19 +76,22 @@ but \bold{aren't} allowed here:\itemize{
}
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")

dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels <- getinfo(dtrain, "label")
setinfo(dtrain, "label", 1 - labels)

labels2 <- getinfo(dtrain, "label")
stopifnot(all(labels2 == 1 - labels))
data(agaricus.train, package = "xgboost")

labels2 <- getinfo(dtrain, 'label')
stopifnot(all(labels2 == 1-labels))
data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all.equal(labels2, 1-labels))
labels <- getinfo(dtrain, "label")
setinfo(dtrain, "label", 1 - labels)

labels2 <- getinfo(dtrain, "label")
stopifnot(all.equal(labels2, 1 - labels))
}

@ -13,10 +13,10 @@
  predcontrib = FALSE,
  approxcontrib = FALSE,
  predinteraction = FALSE,
  reshape = FALSE,
  training = FALSE,
  iterationrange = NULL,
  strict_shape = FALSE,
  avoid_transpose = FALSE,
  validate_features = FALSE,
  base_margin = NULL,
  ...
@ -28,35 +28,36 @@
\item{newdata}{Takes \code{data.frame}, \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
local data file, or \code{xgb.DMatrix}.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ For single-row predictions on sparse data, it's recommended to use CSR format. If passing
a sparse vector, it will take it as a row vector.
For single-row predictions on sparse data, it is recommended to use CSR format. If passing
a sparse vector, it will take it as a row vector.

Note that, for repeated predictions on the same data, one might want to create a DMatrix to
pass here instead of passing R types like matrices or data frames, as predictions will be
faster on DMatrix.
Note that, for repeated predictions on the same data, one might want to create a DMatrix to
pass here instead of passing R types like matrices or data frames, as predictions will be
faster on DMatrix.

If `newdata` is a `data.frame`, be aware that:\\itemize\{
\\item Columns will be converted to numeric if they aren't already, which could potentially make
the operation slower than in an equivalent `matrix` object.
\\item The order of the columns must match with that of the data from which the model was fitted
(i.e. columns will not be referenced by their names, just by their order in the data).
\\item If the model was fitted to data with categorical columns, these columns must be of
`factor` type here, and must use the same encoding (i.e. have the same levels).
\\item If `newdata` contains any `factor` columns, they will be converted to base-0
encoding (same as during DMatrix creation) - hence, one should not pass a `factor`
under a column which during training had a different type.
\}
}\if{html}{\out{</div>}}}
If \code{newdata} is a \code{data.frame}, be aware that:
\itemize{
\item Columns will be converted to numeric if they aren't already, which could potentially make
the operation slower than in an equivalent \code{matrix} object.
\item The order of the columns must match with that of the data from which the model was fitted
(i.e. columns will not be referenced by their names, just by their order in the data).
\item If the model was fitted to data with categorical columns, these columns must be of
\code{factor} type here, and must use the same encoding (i.e. have the same levels).
\item If \code{newdata} contains any \code{factor} columns, they will be converted to base-0
encoding (same as during DMatrix creation) - hence, one should not pass a \code{factor}
under a column which during training had a different type.
}}

\item{missing}{Float value that represents missing values in data (e.g., 0 or some other extreme value).
\item{missing}{Float value that represents missing values in data
(e.g., 0 or some other extreme value).

\if{html}{\out{<div class="sourceCode">}}\preformatted{ This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass
this as an argument to the DMatrix constructor instead.
}\if{html}{\out{</div>}}}
This parameter is not used when \code{newdata} is an \code{xgb.DMatrix} - in such cases,
should pass this as an argument to the DMatrix constructor instead.}

\item{outputmargin}{Whether the prediction should be returned in the form of original untransformed
sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
logistic regression would return log-odds instead of probabilities.}
\item{outputmargin}{Whether the prediction should be returned in the form of
original untransformed sum of predictions from boosting iterations' results.
E.g., setting \code{outputmargin = TRUE} for logistic regression would return log-odds
instead of probabilities.}

\item{predleaf}{Whether to predict per-tree leaf indices.}

@ -66,10 +67,6 @@ logistic regression would return log-odds instead of probabilities.}

\item{predinteraction}{Whether to return contributions of feature interactions to individual predictions (see Details).}

\item{reshape}{Whether to reshape the vector of predictions to matrix form when there are several
prediction outputs per case. No effect if \code{predleaf}, \code{predcontrib},
or \code{predinteraction} is \code{TRUE}.}

\item{training}{Whether the prediction result is used for training. For dart booster,
training predicting will perform dropout.}

@ -77,74 +74,103 @@ training predicting will perform dropout.}
a two-dimensional vector with the start and end numbers in the sequence (same format as R's \code{seq} - i.e.
base-1 indexing, and inclusive of both ends).

\if{html}{\out{<div class="sourceCode">}}\preformatted{ For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will
predict using only the first one.
For example, passing \code{c(1,20)} will predict using the first twenty iterations, while passing \code{c(1,1)} will
predict using only the first one.

If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all
of the iterations (rounds) otherwise.
If passing \code{NULL}, will either stop at the best iteration if the model used early stopping, or use all
of the iterations (rounds) otherwise.

If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
}\if{html}{\out{</div>}}}
If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.}

\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
type and shape of predictions are invariant to the model type.}
\item{strict_shape}{Whether to always return an array with the same dimensions for the given prediction mode
regardless of the model type - meaning that, for example, both a multi-class and a binary classification
model would generate output arrays with the same number of dimensions, with the 'class' dimension having
size equal to '1' for the binary model.

\item{validate_features}{When \code{TRUE}, validate that the Booster's and newdata's feature_names
match (only applicable when both \code{object} and \code{newdata} have feature names).
If passing \code{FALSE} (the default), dimensions will be simplified according to the model type, so that a
binary classification model for example would not have a redundant dimension for 'class'.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder
the columns in `newdata` to match with the booster's.
See documentation for the return type for the exact shape of the output arrays for each prediction mode.}

If the booster has feature types and `newdata` is either an `xgb.DMatrix` or `data.frame`,
will additionally verify that categorical columns are of the correct type in `newdata`,
throwing an error if they do not match.
\item{avoid_transpose}{Whether to output the resulting predictions in the same memory layout in which they
are generated by the core XGBoost library, without transposing them to match the expected output shape.

If passing `FALSE`, it is assumed that the feature names and types are the same,
and come in the same order as in the training data.
Internally, XGBoost uses row-major order for the predictions it generates, while R arrays use column-major
order, hence the result needs to be transposed in order to have the expected shape when represented as
an R array or matrix, which might be a slow operation.

Note that this check might add some sizable latency to the predictions, so it's
recommended to disable it for performance-sensitive applications.
}\if{html}{\out{</div>}}}
If passing \code{TRUE}, then the result will have dimensions in reverse order - for example, rows
will be the last dimensions instead of the first dimension.}

\item{validate_features}{When \code{TRUE}, validate that the Booster's and newdata's
feature_names match (only applicable when both \code{object} and \code{newdata} have feature names).

If the column names differ and \code{newdata} is not an \code{xgb.DMatrix}, will try to reorder
the columns in \code{newdata} to match with the booster's.

If the booster has feature types and \code{newdata} is either an \code{xgb.DMatrix} or
\code{data.frame}, will additionally verify that categorical columns are of the
correct type in \code{newdata}, throwing an error if they do not match.

If passing \code{FALSE}, it is assumed that the feature names and types are the same,
and come in the same order as in the training data.

Note that this check might add some sizable latency to the predictions, so it's
recommended to disable it for performance-sensitive applications.}

\item{base_margin}{Base margin used for boosting from existing model.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that, if `newdata` is an `xgb.DMatrix` object, this argument will
be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}).
}\if{html}{\out{</div>}}}
Note that, if \code{newdata} is an \code{xgb.DMatrix} object, this argument will
be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
an argument in its constructor, or by calling \code{\link[=setinfo.xgb.DMatrix]{setinfo.xgb.DMatrix()}}).}

\item{...}{Not used.}
}
\value{
The return type depends on \code{strict_shape}. If \code{FALSE} (default):
\itemize{
\item For regression or binary classification: A vector of length \code{nrows(newdata)}.
\item For multiclass classification: A vector of length \code{num_class * nrows(newdata)} or
a \verb{(nrows(newdata), num_class)} matrix, depending on the \code{reshape} value.
\item When \code{predleaf = TRUE}: A matrix with one column per tree.
\item When \code{predcontrib = TRUE}: When not multiclass, a matrix with
\code{ num_features + 1} columns. The last "+ 1" column corresponds to the baseline value.
In the multiclass case, a list of \code{num_class} such matrices.
The contribution values are on the scale of untransformed margin
(e.g., for binary classification, the values are log-odds deviations from the baseline).
\item When \code{predinteraction = TRUE}: When not multiclass, the output is a 3d array of
dimension \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal (in the last two dimensions)
elements represent different feature interaction contributions. The array is symmetric WRT the last
two dimensions. The "+ 1" columns correspond to the baselines. Summing this array along the last dimension should
produce practically the same result as \code{predcontrib = TRUE}.
In the multiclass case, a list of \code{num_class} such arrays.
A numeric vector or array, with corresponding dimensions depending on the prediction mode and on
parameter \code{strict_shape} as follows:

If passing \code{strict_shape=FALSE}:\itemize{
\item For regression or binary classification: a vector of length \code{nrows}.
\item For multi-class and multi-target objectives: a matrix of dimensions \verb{[nrows, ngroups]}.

Note that objective variant \code{multi:softmax} defaults towards predicting the most likely class (a vector
\code{nrows}) instead of per-class probabilities.
\item For \code{predleaf}: a matrix with one column per tree.

For multi-class / multi-target, they will be arranged so that columns in the output will have
the leafs from one group followed by leafs of the other group (e.g. order will be \code{group1:feat1},
\code{group1:feat2}, ..., \code{group2:feat1}, \code{group2:feat2}, ...).
\item For \code{predcontrib}: when not multi-class / multi-target, a matrix with dimensions
\verb{[nrows, nfeats+1]}. The last "+ 1" column corresponds to the baseline value.

For multi-class and multi-target objectives, will be an array with dimensions \verb{[nrows, ngroups, nfeats+1]}.

The contribution values are on the scale of untransformed margin (e.g., for binary classification,
the values are log-odds deviations from the baseline).
\item For \code{predinteraction}: when not multi-class / multi-target, the output is a 3D array of
dimensions \verb{[nrows, nfeats+1, nfeats+1]}. The off-diagonal (in the last two dimensions)
elements represent different feature interaction contributions. The array is symmetric w.r.t. the last
two dimensions. The "+ 1" columns correspond to the baselines. Summing this array along the last
dimension should produce practically the same result as \code{predcontrib = TRUE}.

For multi-class and multi-target, will be a 4D array with dimensions \verb{[nrows, ngroups, nfeats+1, nfeats+1]}
}

When \code{strict_shape = TRUE}, the output is always an array:
If passing \code{strict_shape=TRUE}, the result is always an array:
\itemize{
\item For normal predictions, the output has dimension \verb{(num_class, nrow(newdata))}.
\item For \code{predcontrib = TRUE}, the dimension is \verb{(ncol(newdata) + 1, num_class, nrow(newdata))}.
\item For \code{predinteraction = TRUE}, the dimension is \verb{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}.
\item For \code{predleaf = TRUE}, the dimension is \verb{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}.
\item For normal predictions, the dimension is \verb{[nrows, ngroups]}.
\item For \code{predcontrib=TRUE}, the dimension is \verb{[nrows, ngroups, nfeats+1]}.
\item For \code{predinteraction=TRUE}, the dimension is \verb{[nrows, ngroups, nfeats+1, nfeats+1]}.
\item For \code{predleaf=TRUE}, the dimension is \verb{[nrows, niter, ngroups, num_parallel_tree]}.
}

If passing \code{avoid_transpose=TRUE}, then the dimensions in all cases will be in reverse order - for
example, for \code{predinteraction}, they will be \verb{[nfeats+1, nfeats+1, ngroups, nrows]}
instead of \verb{[nrows, ngroups, nfeats+1, nfeats+1]}.
}
\description{
Predict values on data based on xgboost model.
Predict values on data based on XGBoost model.
}
\details{
Note that \code{iterationrange} would currently do nothing for predictions from "gblinear",
@ -211,7 +237,7 @@ str(pred_contr)
summary(rowSums(pred_contr) - qlogis(pred))
# for the 1st record, let's inspect its features that had non-zero contribution to prediction:
contr1 <- pred_contr[1,]
contr1 <- contr1[-length(contr1)] # drop BIAS
contr1 <- contr1[-length(contr1)] # drop intercept
contr1 <- contr1[contr1 != 0] # drop non-contributing features
contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
old_mar <- par("mar")
@ -241,8 +267,6 @@ bst <- xgb.train(
# predict for softmax returns num_class probability numbers per case:
pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
# reshape it to a num_class-columns matrix
pred <- matrix(pred, ncol = num_class, byrow = TRUE)
# convert the probabilities to softmax labels
pred_labels <- max.col(pred) - 1
# the following should result in the same error as seen in the last iteration

@ -21,9 +21,8 @@ Print information about \code{xgb.Booster}.
|
||||
data(agaricus.train, package = "xgboost")
|
||||
train <- agaricus.train
|
||||
|
||||
bst <- xgboost(
|
||||
data = train$data,
|
||||
label = train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train$data, label = train$label),
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
nthread = 2,
|
||||
@ -34,5 +33,4 @@ bst <- xgboost(
|
||||
attr(bst, "myattr") <- "memo"
|
||||
|
||||
print(bst)
|
||||
|
||||
}
|
||||
|
||||
@ -7,21 +7,22 @@
|
||||
\method{print}{xgb.DMatrix}(x, verbose = FALSE, ...)
|
||||
}
|
||||
\arguments{
|
||||
\item{x}{an xgb.DMatrix object}
|
||||
\item{x}{An xgb.DMatrix object.}
|
||||
|
||||
\item{verbose}{whether to print colnames (when present)}
|
||||
\item{verbose}{Whether to print colnames (when present).}
|
||||
|
||||
\item{...}{not currently used}
|
||||
\item{...}{Not currently used.}
|
||||
}
|
||||
\description{
|
||||
Print information about xgb.DMatrix.
|
||||
Currently it displays dimensions and presence of info-fields and colnames.
|
||||
}
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
||||
data(agaricus.train, package = "xgboost")
|
||||
|
||||
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
||||
dtrain
|
||||
print(dtrain, verbose=TRUE)
|
||||
|
||||
print(dtrain, verbose = TRUE)
|
||||
|
||||
}
|
||||
|
||||
@ -7,25 +7,33 @@
|
||||
\method{print}{xgb.cv.synchronous}(x, verbose = FALSE, ...)
|
||||
}
|
||||
\arguments{
|
||||
\item{x}{an \code{xgb.cv.synchronous} object}
|
||||
\item{x}{An \code{xgb.cv.synchronous} object.}
|
||||
|
||||
\item{verbose}{whether to print detailed data}
|
||||
\item{verbose}{Whether to print detailed data.}
|
||||
|
||||
\item{...}{passed to \code{data.table.print}}
|
||||
\item{...}{Passed to \code{data.table.print()}.}
|
||||
}
|
||||
\description{
|
||||
Prints formatted results of \code{xgb.cv}.
|
||||
Prints formatted results of \code{\link[=xgb.cv]{xgb.cv()}}.
|
||||
}
|
||||
\details{
|
||||
When not verbose, it would only print the evaluation results,
|
||||
including the best iteration (when available).
|
||||
}
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.train, package = "xgboost")
|
||||
|
||||
train <- agaricus.train
|
||||
cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2,
|
||||
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
cv <- xgb.cv(
|
||||
data = xgb.DMatrix(train$data, label = train$label),
|
||||
nfold = 5,
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
nthread = 2,
|
||||
nrounds = 2,
|
||||
objective = "binary:logistic"
|
||||
)
|
||||
print(cv)
|
||||
print(cv, verbose=TRUE)
|
||||
print(cv, verbose = TRUE)
|
||||
|
||||
}
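
The fields summarized by this print method can also be inspected directly; a minimal sketch, assuming the \code{cv} object fitted in the example above:

cv$evaluation_log  # per-iteration metric means and standard deviations across folds
cv$params          # the parameters the folds were trained with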

@@ -13,8 +13,8 @@
}
\description{
Returns the feature / variable / column names from a fitted
-booster object, which are set automatically during the call to \link{xgb.train}
-from the DMatrix names, or which can be set manually through \link{setinfo}.
+booster object, which are set automatically during the call to \code{\link[=xgb.train]{xgb.train()}}
+from the DMatrix names, or which can be set manually through \code{\link[=setinfo]{setinfo()}}.

If the object doesn't have feature names, will return \code{NULL}.
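
A minimal sketch of both behaviors described above, assuming this entry documents the \code{variable.names()} method for boosters and that \code{setinfo()} accepts the field name "feature_name" (both are our assumptions):

data(mtcars)
x <- as.matrix(mtcars[, -1])
model <- xgb.train(
  params = list(nthread = 1),
  data = xgb.DMatrix(x, label = mtcars$mpg, nthread = 1),
  nrounds = 2
)
variable.names(model)  # names picked up automatically from the DMatrix
setinfo(model, "feature_name", toupper(colnames(x)))  # hypothetical manual override
variable.names(model)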

@@ -53,12 +53,12 @@ Return values of \code{NULL} will be interpreted as \code{FALSE}.}
\item{f_after_training}{A function that will be executed after training is finished.

This function can optionally output something non-NULL, which will become part of the R
-attributes of the booster (assuming one passes \code{keep_extra_attributes=TRUE} to \link{xgb.train})
-under the name supplied for parameter \code{cb_name} imn the case of \link{xgb.train}; or a part
-of the named elements in the result of \link{xgb.cv}.}
+attributes of the booster (assuming one passes \code{keep_extra_attributes=TRUE} to \code{\link[=xgb.train]{xgb.train()}})
+under the name supplied for parameter \code{cb_name} in the case of \code{\link[=xgb.train]{xgb.train()}}; or a part
+of the named elements in the result of \code{\link[=xgb.cv]{xgb.cv()}}.}
}
\value{
-An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.
}
\description{
Constructor for defining the structure of callback functions that can be executed
@@ -66,8 +66,8 @@ at different stages of model training (before / after training, before / after each
iteration).
}
\details{
-Arguments that will be passed to the supplied functions are as follows:\itemize{
+Arguments that will be passed to the supplied functions are as follows:
+\itemize{
\item env The same environment that is passed under argument \code{env}.

It may be modified by the functions in order to e.g. keep track of what happens
@@ -75,11 +75,10 @@ across iterations or similar.

This environment is only used by the functions supplied to the callback, and will
not be kept after the model fitting function terminates (see parameter \code{f_after_training}).
-\item model The booster object when using \link{xgb.train}, or the folds when using
-\link{xgb.cv}.
-
-For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{
+\item model The booster object when using \code{\link[=xgb.train]{xgb.train()}}, or the folds when using \code{\link[=xgb.cv]{xgb.cv()}}.
+
+For \code{\link[=xgb.cv]{xgb.cv()}}, folds are a list with a structure as follows:
+\itemize{
\item \code{dtrain}: The training data for the fold (as an \code{xgb.DMatrix} object).
\item \code{bst}: The \code{xgb.Booster} object for the fold.
\item \code{evals}: A list containing two DMatrices, with names \code{train} and \code{test}
@@ -88,79 +87,71 @@ For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{
from which the \code{test} entry in \code{evals} was obtained.
}

-This object should \bold{not} be in-place modified in ways that conflict with the
+This object should \strong{not} be in-place modified in ways that conflict with the
training (e.g. resetting the parameters for a training update in a way that resets
the number of rounds to zero in order to overwrite rounds).

Note that any R attributes that are assigned to the booster during the callback functions,
will not be kept thereafter as the booster object variable is not re-assigned during
training. It is however possible to set C-level attributes of the booster through
-\link{xgb.attr} or \link{xgb.attributes}, which should remain available for the rest
+\code{\link[=xgb.attr]{xgb.attr()}} or \code{\link[=xgb.attributes]{xgb.attributes()}}, which should remain available for the rest
of the iterations and after the training is done.

For keeping variables across iterations, it's recommended to use \code{env} instead.
\item data The data to which the model is being fit, as an \code{xgb.DMatrix} object.

-Note that, for \link{xgb.cv}, this will be the full data, while data for the specific
+Note that, for \code{\link[=xgb.cv]{xgb.cv()}}, this will be the full data, while data for the specific
folds can be found in the \code{model} object.
-\item evals The evaluation data, as passed under argument \code{evals} to
-\link{xgb.train}.
-
-For \link{xgb.cv}, this will always be \code{NULL}.
-
-\item begin_iteration Index of the first boosting iteration that will be executed
-(base-1 indexing).
+\item evals The evaluation data, as passed under argument \code{evals} to \code{\link[=xgb.train]{xgb.train()}}.
+
+For \code{\link[=xgb.cv]{xgb.cv()}}, this will always be \code{NULL}.
+\item begin_iteration Index of the first boosting iteration that will be executed (base-1 indexing).

This will typically be '1', but when using training continuation, depending on the
parameters for updates, boosting rounds will be continued from where the previous
model ended, in which case this will be larger than 1.

\item end_iteration Index of the last boosting iteration that will be executed
(base-1 indexing, inclusive of this end).

-It should match with argument \code{nrounds} passed to \link{xgb.train} or \link{xgb.cv}.
+It should match with argument \code{nrounds} passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.

Note that boosting might be interrupted before reaching this last iteration, for
-example by using the early stopping callback \link{xgb.cb.early.stop}.
-
+example by using the early stopping callback \code{\link[=xgb.cb.early.stop]{xgb.cb.early.stop()}}.
\item iteration Index of the iteration number that is being executed (first iteration
will be the same as parameter \code{begin_iteration}, then next one will add +1, and so on).

\item iter_feval Evaluation metrics for \code{evals} that were supplied, either
determined by the objective, or by parameter \code{feval}.

-For \link{xgb.train}, this will be a named vector with one entry per element in
+For \code{\link[=xgb.train]{xgb.train()}}, this will be a named vector with one entry per element in
\code{evals}, where the names are determined as 'evals name' + '-' + 'metric name' - for
example, if \code{evals} contains an entry named "tr" and the metric is "rmse",
this will be a one-element vector with name "tr-rmse".

-For \link{xgb.cv}, this will be a 2d matrix with dimensions \verb{[length(evals), nfolds]},
+For \code{\link[=xgb.cv]{xgb.cv()}}, this will be a 2d matrix with dimensions \verb{[length(evals), nfolds]},
where the row names will follow the same naming logic as the one-dimensional vector
-that is passed in \link{xgb.train}.
+that is passed in \code{\link[=xgb.train]{xgb.train()}}.

Note that, internally, the built-in callbacks such as \link{xgb.cb.print.evaluation} summarize
this table by calculating the row-wise means and standard deviations.

\item final_feval The evaluation results after the last boosting round is executed
(same format as \code{iter_feval}, and will be the exact same input as passed under
\code{iter_feval} to the last round that is executed during model fitting).

\item prev_cb_res Result from a previous run of a callback sharing the same name
(as given by parameter \code{cb_name}) when conducting training continuation, if there
was any in the booster R attributes.

-Some times, one might want to append the new results to the previous one, and this will
+Sometimes, one might want to append the new results to the previous one, and this will
be done automatically by the built-in callbacks such as \link{xgb.cb.evaluation.log},
which will append the new rows to the previous table.

If no such previous callback result is available (which it never will when fitting
a model from start instead of updating an existing model), this will be \code{NULL}.

-For \link{xgb.cv}, which doesn't support training continuation, this will always be \code{NULL}.
+For \code{\link[=xgb.cv]{xgb.cv()}}, which doesn't support training continuation, this will always be \code{NULL}.
}
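
Given the argument list above, a minimal sketch of a custom callback that merely counts executed iterations (the name \code{iter_counter} is illustrative):

count_cb <- xgb.Callback(
  cb_name = "iter_counter",
  f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) {
    env$n <- 0  # initialize state in the shared environment
  },
  f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
    env$n <- env$n + 1
    FALSE  # returning TRUE would request early termination of boosting
  },
  f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) {
    env$n  # a non-NULL return becomes an R attribute of the booster
  }
)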

-The following names (\code{cb_name} values) are reserved for internal callbacks:\itemize{
+The following names (\code{cb_name} values) are reserved for internal callbacks:
+\itemize{
\item print_evaluation
\item evaluation_log
\item reset_parameters
@@ -170,7 +161,8 @@ The following names (\code{cb_name} values) are reserved for internal callbacks:
\item gblinear_history
}

-The following names are reserved for other non-callback attributes:\itemize{
+The following names are reserved for other non-callback attributes:
+\itemize{
\item names
\item class
\item call
@@ -221,8 +213,10 @@ ssq_callback <- xgb.Callback(
)

data(mtcars)
+
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
+
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(
  data = dm,
@@ -236,7 +230,8 @@ model <- xgb.train(
attributes(model)$ssq
}
\seealso{
-Built-in callbacks:\itemize{
+Built-in callbacks:
+\itemize{
\item \link{xgb.cb.print.evaluation}
\item \link{xgb.cb.evaluation.log}
\item \link{xgb.cb.reset.parameters}

@@ -57,20 +57,20 @@ was constructed.

Other column types are not supported.
\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
-\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. These are \bold{not} supported for
+\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. These are \strong{not} supported for
'xgb.QuantileDMatrix'.
\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
as a single row (only when making predictions from a fitted model).
\item Text files in a supported format, passed as a \code{character} variable containing the URI path to
the file, with an optional format specifier.

-These are \bold{not} supported for \code{xgb.QuantileDMatrix}. Supported formats are:\itemize{
-\item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
+These are \strong{not} supported for \code{xgb.QuantileDMatrix}. Supported formats are:\itemize{
+\item XGBoost's own binary format for DMatrices, as produced by \code{\link[=xgb.DMatrix.save]{xgb.DMatrix.save()}}.
\item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
\code{?format=libsvm} at the end of the file path. It will be the default format if not
otherwise specified.
\item CSV files (comma-separated values). This format can be specified by adding suffix
-\code{?format=csv} at the end ofthe file path. It will \bold{not} be auto-deduced from file extensions.
+\code{?format=csv} at the end of the file path. It will \strong{not} be auto-deduced from file extensions.
}

Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
@@ -96,30 +96,27 @@ so it doesn't make sense to assign weights to individual data points.}

\item{base_margin}{Base margin used for boosting from existing model.

-\if{html}{\out{<div class="sourceCode">}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin.
-}\if{html}{\out{</div>}}}
+In the case of multi-output models, one can also pass multi-dimensional base_margin.}

\item{missing}{A float value to represent missing values in data (not used when creating DMatrix
-from text files).
-It is useful to change when a zero, infinite, or some other extreme value represents missing
-values in data.}
+from text files). It is useful to change when a zero, infinite, or some other
+extreme value represents missing values in data.}

\item{silent}{whether to suppress printing an informational message after loading from a file.}

-\item{feature_names}{Set names for features. Overrides column names in data
-frame and matrix.
-
-\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there
-must be the same as in the DMatrix construction, regardless of the column names.
-}\if{html}{\out{</div>}}}
+\item{feature_names}{Set names for features. Overrides column names in data frame and matrix.
+
+Note: columns are not referenced by name when calling \code{predict}, so the column order there
+must be the same as in the DMatrix construction, regardless of the column names.}

\item{feature_types}{Set types for features.

-If \code{data} is a \code{data.frame} and passing \code{feature_types} is not supplied, feature types will be deduced
-automatically from the column types.
+If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied,
+feature types will be deduced automatically from the column types.

Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
-with the following possible values:\itemize{
+with the following possible values:
+\itemize{
\item "c", which represents categorical columns.
\item "q", which represents numeric columns.
\item "int", which represents integer columns.
@@ -130,9 +127,9 @@ Note that, while categorical types are treated differently from the rest for model fitting
purposes, the other types do not influence the generated model, but have effects in other
functionalities such as feature importances.

-\bold{Important}: categorical features, if specified manually through \code{feature_types}, must
+\strong{Important}: Categorical features, if specified manually through \code{feature_types}, must
be encoded as integers with numeration starting at zero, and the same encoding needs to be
-applied when passing data to \code{predict}. Even if passing \code{factor} types, the encoding will
+applied when passing data to \code{\link[=predict]{predict()}}. Even if passing \code{factor} types, the encoding will
not be saved, so make sure that \code{factor} columns passed to \code{predict} have the same \code{levels}.}

\item{nthread}{Number of threads used for creating DMatrix.}
@@ -156,7 +153,7 @@ how the file was split beforehand. Default to row.
This is not used when \code{data} is not a URI.}

\item{ref}{The training dataset that provides quantile information, needed when creating
-validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
+validation/test dataset with \code{\link[=xgb.QuantileDMatrix]{xgb.QuantileDMatrix()}}. Supplying the training DMatrix
as a reference means that the same quantisation applied to the training data is
applied to the validation/test data.}

@@ -171,23 +168,24 @@ subclass 'xgb.QuantileDMatrix'.
}
\description{
Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
-such as \link{xgb.train} or \link{predict.xgb.Booster}.
+such as \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=predict]{predict()}}.
}
\details{
-Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram
+Function \code{xgb.QuantileDMatrix()} will construct a DMatrix with quantization for the histogram
method already applied to it, which can be used to reduce memory usage (compared to using
a regular DMatrix first and then creating a quantization out of it) when using the histogram
method (\code{tree_method = "hist"}, which is the default algorithm), but is not usable for the
sorted-indices method (\code{tree_method = "exact"}), nor for the approximate method
(\code{tree_method = "approx"}).

-Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
+Note that DMatrix objects are not serializable through R functions such as \code{\link[=saveRDS]{saveRDS()}} or \code{\link[=save]{save()}}.
If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
from the original source of data.
}
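
A minimal sketch of the \code{ref} mechanism described above, assuming the bundled agaricus datasets as stand-ins for train/validation splits:

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
qdtrain <- xgb.QuantileDMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
# reuse the training quantiles when binning the validation/test data:
qdtest <- xgb.QuantileDMatrix(agaricus.test$data, label = agaricus.test$label, ref = qdtrain, nthread = 1)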
\examples{
-data(agaricus.train, package='xgboost')
+data(agaricus.train, package = "xgboost")
+
## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)

@@ -16,11 +16,10 @@ Checks whether an xgb.DMatrix object has a given field assigned to
it, such as weights, labels, etc.
}
\examples{
-library(xgboost)
x <- matrix(1:10, nrow = 5)
dm <- xgb.DMatrix(x, nthread = 1)

-# 'dm' so far doesn't have any fields set
+# 'dm' so far does not have any fields set
xgb.DMatrix.hasinfo(dm, "label")

# Fields can be added after construction
@@ -28,5 +27,5 @@ setinfo(dm, "label", 1:5)
xgb.DMatrix.hasinfo(dm, "label")
}
\seealso{
-\link{xgb.DMatrix}, \link{getinfo.xgb.DMatrix}, \link{setinfo.xgb.DMatrix}
+\code{\link[=xgb.DMatrix]{xgb.DMatrix()}}, \code{\link[=getinfo.xgb.DMatrix]{getinfo.xgb.DMatrix()}}, \code{\link[=setinfo.xgb.DMatrix]{setinfo.xgb.DMatrix()}}
}

@@ -16,7 +16,8 @@ Save xgb.DMatrix object to binary file
}
\examples{
\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
-data(agaricus.train, package='xgboost')
+data(agaricus.train, package = "xgboost")
+
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
fname <- file.path(tempdir(), "xgb.DMatrix.data")
xgb.DMatrix.save(dtrain, fname)

@@ -21,16 +21,17 @@ xgb.DataBatch(
\arguments{
\item{data}{The data belonging to this batch.

-Note that not all of the input types supported by \link{xgb.DMatrix} are possible
-to pass here. Supported types are:\itemize{
+Note that not all of the input types supported by \code{\link[=xgb.DMatrix]{xgb.DMatrix()}} are possible
+to pass here. Supported types are:
+\itemize{
\item \code{matrix}, with types \code{numeric}, \code{integer}, and \code{logical}. Note that for types
\code{integer} and \code{logical}, missing values might not be automatically recognized
-as such - see the documentation for parameter \code{missing} in \link{xgb.ExternalDMatrix}
+as such - see the documentation for parameter \code{missing} in \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}
for details on this.
\item \code{data.frame}, with the same types as supported by 'xgb.DMatrix' and same
conversions applied to it. See the documentation for parameter \code{data} in
-\link{xgb.DMatrix} for details on it.
-\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
+\code{\link[=xgb.DMatrix]{xgb.DMatrix()}} for details on it.
+\item CSR matrices, as class \code{dgRMatrix} from package "Matrix".
}}

\item{label}{Label of the training data. For classification problems, should be passed encoded as
@@ -45,23 +46,21 @@ so it doesn't make sense to assign weights to individual data points.}

\item{base_margin}{Base margin used for boosting from existing model.

-\if{html}{\out{<div class="sourceCode">}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin.
-}\if{html}{\out{</div>}}}
+In the case of multi-output models, one can also pass multi-dimensional base_margin.}

-\item{feature_names}{Set names for features. Overrides column names in data
-frame and matrix.
-
-\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there
-must be the same as in the DMatrix construction, regardless of the column names.
-}\if{html}{\out{</div>}}}
+\item{feature_names}{Set names for features. Overrides column names in data frame and matrix.
+
+Note: columns are not referenced by name when calling \code{predict}, so the column order there
+must be the same as in the DMatrix construction, regardless of the column names.}

\item{feature_types}{Set types for features.

-If \code{data} is a \code{data.frame} and passing \code{feature_types} is not supplied, feature types will be deduced
-automatically from the column types.
+If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied,
+feature types will be deduced automatically from the column types.

Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
-with the following possible values:\itemize{
+with the following possible values:
+\itemize{
\item "c", which represents categorical columns.
\item "q", which represents numeric columns.
\item "int", which represents integer columns.
@@ -72,9 +71,9 @@ Note that, while categorical types are treated differently from the rest for model fitting
purposes, the other types do not influence the generated model, but have effects in other
functionalities such as feature importances.

-\bold{Important}: categorical features, if specified manually through \code{feature_types}, must
+\strong{Important}: Categorical features, if specified manually through \code{feature_types}, must
be encoded as integers with numeration starting at zero, and the same encoding needs to be
-applied when passing data to \code{predict}. Even if passing \code{factor} types, the encoding will
+applied when passing data to \code{\link[=predict]{predict()}}. Even if passing \code{factor} types, the encoding will
not be saved, so make sure that \code{factor} columns passed to \code{predict} have the same \code{levels}.}

\item{group}{Group size for all ranking groups.}
@@ -89,24 +88,24 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} have the same \code{levels}.}
}
\value{
An object of class \code{xgb.DataBatch}, which is just a list containing the
-data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}.
+data and parameters passed here. It does \strong{not} inherit from \code{xgb.DMatrix}.
}
\description{
Helper function to supply data in batches of a data iterator when
-constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
-or through \link{xgb.QuantileDMatrix.from_iterator}.
+constructing a DMatrix from external memory through \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}
+or through \code{\link[=xgb.QuantileDMatrix.from_iterator]{xgb.QuantileDMatrix.from_iterator()}}.

-This function is \bold{only} meant to be called inside of a callback function (which
-is passed as argument to function \link{xgb.DataIter} to construct a data iterator)
+This function is \strong{only} meant to be called inside of a callback function (which
+is passed as argument to function \code{\link[=xgb.DataIter]{xgb.DataIter()}} to construct a data iterator)
when constructing a DMatrix through external memory - otherwise, one should call
-\link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
+\code{\link[=xgb.DMatrix]{xgb.DMatrix()}} or \code{\link[=xgb.QuantileDMatrix]{xgb.QuantileDMatrix()}}.

-The object that results from calling this function directly is \bold{not} like
+The object that results from calling this function directly is \strong{not} like
an \code{xgb.DMatrix} - i.e. cannot be used to train a model, nor to get predictions - only
possible usage is to supply data to an iterator, from which a DMatrix is then constructed.

-For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
+For more information and for example usage, see the documentation for \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}.
}
\seealso{
-\link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
+\code{\link[=xgb.DataIter]{xgb.DataIter()}}, \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}.
}

@@ -13,14 +13,15 @@ used to keep track of variables to determine how to handle the batches.
For example, one might want to keep track of an iteration number in this environment in order
to know which part of the data to pass next.}

-\item{f_next}{\verb{function(env)} which is responsible for:\itemize{
+\item{f_next}{\verb{function(env)} which is responsible for:
+\itemize{
\item Accessing or retrieving the next batch of data in the iterator.
-\item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
+\item Supplying this data by calling function \code{\link[=xgb.DataBatch]{xgb.DataBatch()}} on it and returning the result.
\item Keeping track of where in the iterator batch it is or will go next, which can for example
be done by modifying variables in the \code{env} variable that is passed here.
\item Signaling whether there are more batches to be consumed or not, by returning \code{NULL}
when the stream of data ends (all batches in the iterator have been consumed), or the result from
-calling \link{xgb.DataBatch} when there are more batches in the line to be consumed.
+calling \code{\link[=xgb.DataBatch]{xgb.DataBatch()}} when there are more batches in the line to be consumed.
}}

\item{f_reset}{\verb{function(env)} which is responsible for resetting the data iterator
@@ -32,7 +33,7 @@ Note that, after resetting the iterator, the batches will be accessed again, so
}
\value{
An \code{xgb.DataIter} object, containing the same inputs supplied here, which can then
-be passed to \link{xgb.ExternalDMatrix}.
+be passed to \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}.
}
\description{
Interface to create a custom data iterator in order to construct a DMatrix
@@ -41,11 +42,11 @@ from external memory.
This function is responsible for generating an R object structure containing callback
functions and an environment shared with them.

-The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix},
+The output structure from this function is then meant to be passed to \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}},
which will consume the data and create a DMatrix from it by executing the callback functions.

-For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
+For more information, and for a usage example, see the documentation for \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}.
}
\seealso{
-\link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
+\code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}, \code{\link[=xgb.DataBatch]{xgb.DataBatch()}}.
}
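
A minimal sketch of such an iterator, splitting \code{mtcars} into two hypothetical batches (the canonical, fuller example appears under the \code{xgb.ExtMemDMatrix} entry below):

data(mtcars)
batches <- list(mtcars[1:16, ], mtcars[17:32, ])
it_env <- as.environment(list(i = 0))
iterator <- xgb.DataIter(
  env = it_env,
  f_next = function(env) {
    env$i <- env$i + 1
    if (env$i > length(batches)) return(NULL)  # signal end of the stream
    b <- batches[[env$i]]
    xgb.DataBatch(data = as.matrix(b[, -1]), label = b$mpg)
  },
  f_reset = function(env) {
    env$i <- 0  # batches will be requested again from the start
  }
)
dm <- xgb.ExtMemDMatrix(iterator, cache_prefix = tempdir(), nthread = 1)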

@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
-\name{xgb.ExternalDMatrix}
-\alias{xgb.ExternalDMatrix}
+\name{xgb.ExtMemDMatrix}
+\alias{xgb.ExtMemDMatrix}
\title{DMatrix from External Data}
\usage{
-xgb.ExternalDMatrix(
+xgb.ExtMemDMatrix(
  data_iterator,
  cache_prefix = tempdir(),
  missing = NA,
@@ -12,7 +12,7 @@ xgb.ExternalDMatrix(
)
}
\arguments{
-\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter},
+\item{data_iterator}{A data iterator structure as returned by \code{\link[=xgb.DataIter]{xgb.DataIter()}},
which includes an environment shared between function calls, and functions to access
the data in batches on-demand.}

@@ -20,40 +20,39 @@ the data in batches on-demand.}

\item{missing}{A float value to represent missing values in data.

-Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it
+Note that, while functions like \code{\link[=xgb.DMatrix]{xgb.DMatrix()}} can take a generic \code{NA} and interpret it
correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here,
it will not be adapted for different input types.

For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
-which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
-'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
+which is interpreted as a floating-point NaN by \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}} and by
+\code{\link[=xgb.QuantileDMatrix.from_iterator]{xgb.QuantileDMatrix.from_iterator()}}, these integer missing values will not be treated as missing.
This should not pose any problem for \code{numeric} types, since they do have an inherent NaN value.}

\item{nthread}{Number of threads used for creating DMatrix.}
}
\value{
-An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
+An 'xgb.DMatrix' object, with subclass 'xgb.ExtMemDMatrix', in which the data is not
held internally but accessed through the iterator when needed.
}
\description{
-Create a special type of xgboost 'DMatrix' object from external data
-supplied by an \link{xgb.DataIter} object, potentially passed in batches from a
+Create a special type of XGBoost 'DMatrix' object from external data
+supplied by an \code{\link[=xgb.DataIter]{xgb.DataIter()}} object, potentially passed in batches from a
bigger set that might not fit entirely in memory.

The data supplied by the iterator is accessed on-demand as needed, multiple times,
-without being concatenated, but note that fields like 'label' \bold{will} be
+without being concatenated, but note that fields like 'label' \strong{will} be
concatenated from multiple calls to the data iterator.

For more information, see the guide 'Using XGBoost External Memory Version':
\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
}
\examples{
library(xgboost)
data(mtcars)

-# this custom environment will be passed to the iterator
-# functions at each call. It's up to the user to keep
+# This custom environment will be passed to the iterator
+# functions at each call. It is up to the user to keep
# track of the iteration number in this environment.
iterator_env <- as.environment(
  list(
@@ -106,7 +105,7 @@ data_iterator <- xgb.DataIter(
cache_prefix <- tempdir()

# DMatrix will be constructed from the iterator's batches
-dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
+dm <- xgb.ExtMemDMatrix(data_iterator, cache_prefix, nthread = 1)

# After construction, can be used as a regular DMatrix
params <- list(nthread = 1, objective = "reg:squarederror")
@@ -118,5 +117,5 @@ pred_dm <- predict(model, dm)
pred_mat <- predict(model, as.matrix(mtcars[, -1]))
}
\seealso{
-\link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
+\code{\link[=xgb.DataIter]{xgb.DataIter()}}, \code{\link[=xgb.DataBatch]{xgb.DataBatch()}}, \code{\link[=xgb.QuantileDMatrix.from_iterator]{xgb.QuantileDMatrix.from_iterator()}}
}

@@ -13,26 +13,26 @@ xgb.QuantileDMatrix.from_iterator(
)
}
\arguments{
-\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter},
+\item{data_iterator}{A data iterator structure as returned by \code{\link[=xgb.DataIter]{xgb.DataIter()}},
which includes an environment shared between function calls, and functions to access
the data in batches on-demand.}

\item{missing}{A float value to represent missing values in data.

-Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it
+Note that, while functions like \code{\link[=xgb.DMatrix]{xgb.DMatrix()}} can take a generic \code{NA} and interpret it
correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here,
it will not be adapted for different input types.

For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
-which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
-'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
+which is interpreted as a floating-point NaN by \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}} and by
+\code{\link[=xgb.QuantileDMatrix.from_iterator]{xgb.QuantileDMatrix.from_iterator()}}, these integer missing values will not be treated as missing.
This should not pose any problem for \code{numeric} types, since they do have an inherent NaN value.}

\item{nthread}{Number of threads used for creating DMatrix.}

\item{ref}{The training dataset that provides quantile information, needed when creating
-validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
+validation/test dataset with \code{\link[=xgb.QuantileDMatrix]{xgb.QuantileDMatrix()}}. Supplying the training DMatrix
as a reference means that the same quantisation applied to the training data is
applied to the validation/test data.}

@@ -46,20 +46,20 @@ An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
}
\description{
Create an \code{xgb.QuantileDMatrix} object (exact same class as would be returned by
-calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from
-external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from
-a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}.
+calling function \code{\link[=xgb.QuantileDMatrix]{xgb.QuantileDMatrix()}}, with the same advantages and limitations) from
+external data supplied by \code{\link[=xgb.DataIter]{xgb.DataIter()}}, potentially passed in batches from
+a bigger set that might not fit entirely in memory, same way as \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}.

Note that, while external data will only be loaded through the iterator (thus the full data
might not be held entirely in-memory), the quantized representation of the data will get
created in-memory, being concatenated from multiple calls to the data iterator. The quantized
version is typically lighter than the original data, so there might be cases in which this
-representation could potentially fit in memory even if the full data doesn't.
+representation could potentially fit in memory even if the full data does not.

For more information, see the guide 'Using XGBoost External Memory Version':
\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
}
\seealso{
-\link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
-\link{xgb.QuantileDMatrix}
+\code{\link[=xgb.DataIter]{xgb.DataIter()}}, \code{\link[=xgb.DataBatch]{xgb.DataBatch()}}, \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}},
+\code{\link[=xgb.QuantileDMatrix]{xgb.QuantileDMatrix()}}
}
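
Reusing an iterator like the one sketched under \code{xgb.DataIter}, the quantized variant is a one-liner (a sketch; \code{ref} could additionally be passed when building validation data):

qdm <- xgb.QuantileDMatrix.from_iterator(iterator, nthread = 1)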

@@ -16,7 +16,7 @@ xgb.attributes(object)
xgb.attributes(object) <- value
}
\arguments{
-\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.}
+\item{object}{Object of class \code{xgb.Booster}. \strong{Will be modified in-place} when assigning to it.}

\item{name}{A non-empty character string specifying which attribute is to be accessed.}

@@ -36,18 +36,18 @@ or \code{NULL} if a model has no stored attributes.
}
}
\description{
-These methods allow to manipulate the key-value attribute strings of an xgboost model.
+These methods allow manipulating the key-value attribute strings of an XGBoost model.
}
\details{
-The primary purpose of xgboost model attributes is to store some meta data about the model.
+The primary purpose of XGBoost model attributes is to store some metadata about the model.
Note that they are a separate concept from the object attributes in R.
-Specifically, they refer to key-value strings that can be attached to an xgboost model,
+Specifically, they refer to key-value strings that can be attached to an XGBoost model,
stored together with the model's binary representation, and accessed later
(from R or any other interface).
In contrast, any R attribute assigned to an R object of \code{xgb.Booster} class
-would not be saved by \code{\link[=xgb.save]{xgb.save()}} because an xgboost model is an external memory object
+would not be saved by \code{\link[=xgb.save]{xgb.save()}} because an XGBoost model is an external memory object
and its serialization is handled externally.
-Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't
+Also, setting an attribute that has the same name as one of XGBoost's parameters wouldn't
change the value of that parameter for a model.
Use \code{\link[=xgb.parameters<-]{xgb.parameters<-()}} to set or change model parameters.

@@ -57,16 +57,15 @@ but it doesn't delete the other existing attributes.
Important: since this modifies the booster's C object, semantics for assignment here
will differ from R's, as any object reference to the same booster will be modified
too, while assignment of R attributes through \verb{attributes(model)$<attr> <- <value>}
-will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an
+will follow the usual copy-on-write R semantics (see \code{\link[=xgb.copy.Booster]{xgb.copy.Booster()}} for an
example of these behaviors).
}
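
A minimal sketch of the in-place assignment semantics described above, assuming a fitted booster \code{bst}:

bst2 <- bst                        # plain R assignment: same C-level object
xgb.attr(bst, "note") <- "hello"   # C-level attribute, stored with the model
xgb.attr(bst2, "note")             # "hello" - visible through both references
attributes(bst)$memo <- "R only"   # R attribute: copy-on-write, not kept by xgb.save()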
\examples{
data(agaricus.train, package = "xgboost")
train <- agaricus.train

-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 1,
  nthread = 2,

@@ -2,7 +2,7 @@
% Please edit documentation in R/callbacks.R
\name{xgb.cb.cv.predict}
\alias{xgb.cb.cv.predict}
-\title{Callback for returning cross-validation based predictions.}
+\title{Callback for returning cross-validation based predictions}
\usage{
xgb.cb.cv.predict(save_models = FALSE, outputmargin = FALSE)
}
@@ -13,8 +13,8 @@ xgb.cb.cv.predict(save_models = FALSE, outputmargin = FALSE)
parameter to \link{predict.xgb.Booster}).}
}
\value{
-An \code{xgb.Callback} object, which can be passed to \link{xgb.cv},
-but \bold{not} to \link{xgb.train}.
+An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.cv]{xgb.cv()}},
+but \strong{not} to \code{\link[=xgb.train]{xgb.train()}}.
}
\description{
This callback function saves predictions for all of the test folds,
@@ -24,7 +24,7 @@ and also allows to save the folds' models.
Predictions are saved inside of the \code{pred} element, which is either a vector or a matrix,
depending on the number of prediction outputs per data row. The order of predictions corresponds
to the order of rows in the original dataset. Note that when a custom \code{folds} list is
-provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a
+provided in \code{\link[=xgb.cv]{xgb.cv()}}, the predictions would only be returned properly when this list is a
non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be
meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits.
When some of the indices in the training dataset are not included into user-provided \code{folds},

@@ -23,7 +23,7 @@ stopping. If not set, the last column would be used.
Let's say the test data in \code{evals} was labelled as \code{dtest},
and one wants to use the AUC in test data for early stopping regardless of where
it is in the \code{evals}, then one of the following would need to be set:
-\code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
+\code{metric_name = 'dtest-auc'} or \code{metric_name = 'dtest_auc'}.
All dash '-' characters in metric names are considered equivalent to '_'.}

\item{verbose}{Whether to print the early stopping information.}
@@ -33,7 +33,7 @@ in the resulting object. If passing \code{FALSE}, will only keep the boosting rounds
up to the detected best iteration, discarding the ones that come after.}
}
\value{
-An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.
}
\description{
This callback function determines the condition for early stopping.
@@ -49,7 +49,7 @@ The same values are also stored as R attributes as a result of the callback, plus an additional
attribute \code{stopped_by_max_rounds} which indicates whether an early stopping by the \code{stopping_rounds}
condition occurred. Note that the \code{best_iteration} that is stored under R attributes will follow
base-1 indexing, so it will be larger by '1' than the C-level 'best_iteration' that is accessed
-through \link{xgb.attr} or \link{xgb.attributes}.
+through \code{\link[=xgb.attr]{xgb.attr()}} or \code{\link[=xgb.attributes]{xgb.attributes()}}.

At least one dataset is required in \code{evals} for early stopping to work.
}
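
A minimal sketch of the \code{metric_name} usage described above, assuming \code{dtrain} and \code{dtest} DMatrices with binary labels:

bst <- xgb.train(
  params = list(objective = "binary:logistic", eval_metric = "auc", nthread = 1),
  data = dtrain,
  nrounds = 100,
  evals = list(dtrain = dtrain, dtest = dtest),
  callbacks = list(
    xgb.cb.early.stop(stopping_rounds = 3, maximize = TRUE, metric_name = "dtest-auc")
  )
)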
|
||||
|
||||
@ -7,14 +7,14 @@
|
||||
xgb.cb.evaluation.log()
|
||||
}
|
||||
\value{
|
||||
An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
|
||||
An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.
|
||||
}
|
||||
\description{
|
||||
Callback for logging the evaluation history
|
||||
}
|
||||
\details{
|
||||
This callback creates a table with per-iteration evaluation metrics (see parameters
|
||||
\code{evals} and \code{feval} in \link{xgb.train}).
|
||||
\code{evals} and \code{feval} in \code{\link[=xgb.train]{xgb.train()}}).
|
||||
|
||||
Note: in the column names of the final data.table, the dash '-' character is replaced with
|
||||
the underscore '_' in order to make the column names more like regular R identifiers.
|
||||
|
||||
@ -7,13 +7,13 @@
|
||||
xgb.cb.gblinear.history(sparse = FALSE)
|
||||
}
|
||||
\arguments{
|
||||
\item{sparse}{when set to \code{FALSE}/\code{TRUE}, a dense/sparse matrix is used to store the result.
|
||||
\item{sparse}{When set to \code{FALSE}/\code{TRUE}, a dense/sparse matrix is used to store the result.
|
||||
Sparse format is useful when one expects only a subset of coefficients to be non-zero,
|
||||
when using the "thrifty" feature selector with fairly small number of top features
|
||||
selected per iteration.}
|
||||
}
|
||||
\value{
|
||||
An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
|
||||
An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.
|
||||
}
|
||||
\description{
|
||||
Callback for collecting coefficients history of a gblinear booster
|
||||
@ -37,11 +37,10 @@ will have column names matching with the feature names, otherwise (when there's
|
||||
one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index'
|
||||
(so e.g. column 'c1' for class '0' will be named 'c1:0').
|
||||
|
||||
With \code{xgb.train}, the output is either a dense or a sparse matrix.
|
||||
With with \code{xgb.cv}, it is a list (one element per each fold) of such
|
||||
matrices.
|
||||
With \code{\link[=xgb.train]{xgb.train()}}, the output is either a dense or a sparse matrix.
|
||||
With with \code{\link[=xgb.cv]{xgb.cv()}}, it is a list (one element per each fold) of such matrices.
|
||||
|
||||
Function \link{xgb.gblinear.history} function provides an easy way to retrieve the
|
||||
Function \link{xgb.gblinear.history} provides an easy way to retrieve the
|
||||
outputs from this callback.
|
||||
}
|
||||
\examples{
|
||||
@ -53,57 +52,109 @@ data.table::setDTthreads(nthread)
|
||||
|
||||
# In the iris dataset, it is hard to linearly separate Versicolor class from the rest
|
||||
# without considering the 2nd order interactions:
|
||||
x <- model.matrix(Species ~ .^2, iris)[,-1]
|
||||
x <- model.matrix(Species ~ .^2, iris)[, -1]
|
||||
colnames(x)
|
||||
dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread)
|
||||
param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
|
||||
lambda = 0.0003, alpha = 0.0003, nthread = nthread)
|
||||
dtrain <- xgb.DMatrix(
|
||||
scale(x),
|
||||
label = 1 * (iris$Species == "versicolor"),
|
||||
nthread = nthread
|
||||
)
|
||||
param <- list(
|
||||
booster = "gblinear",
|
||||
objective = "reg:logistic",
|
||||
eval_metric = "auc",
|
||||
lambda = 0.0003,
|
||||
alpha = 0.0003,
|
||||
nthread = nthread
|
||||
)
|
||||
|
||||
# For 'shotgun', which is a default linear updater, using high eta values may result in
|
||||
# unstable behaviour in some datasets. With this simple dataset, however, the high learning
|
||||
# rate does not break the convergence, but allows us to illustrate the typical pattern of
|
||||
# "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations.
|
||||
bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1.,
|
||||
callbacks = list(xgb.cb.gblinear.history()))
|
||||
bst <- xgb.train(
|
||||
param,
|
||||
dtrain,
|
||||
list(tr = dtrain),
|
||||
nrounds = 200,
|
||||
eta = 1.,
|
||||
callbacks = list(xgb.cb.gblinear.history())
|
||||
)
|
||||
|
||||
# Extract the coefficients' path and plot them vs boosting iteration number:
|
||||
coef_path <- xgb.gblinear.history(bst)
|
||||
matplot(coef_path, type = 'l')
|
||||
matplot(coef_path, type = "l")
|
||||
|
||||
# With the deterministic coordinate descent updater, it is safer to use higher learning rates.
|
||||
# Will try the classical componentwise boosting which selects a single best feature per round:
|
||||
bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8,
|
||||
updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1,
|
||||
callbacks = list(xgb.cb.gblinear.history()))
|
||||
matplot(xgb.gblinear.history(bst), type = 'l')
|
||||
bst <- xgb.train(
|
||||
param,
|
||||
dtrain,
|
||||
list(tr = dtrain),
|
||||
nrounds = 200,
|
||||
eta = 0.8,
|
||||
updater = "coord_descent",
|
||||
feature_selector = "thrifty",
|
||||
top_k = 1,
|
||||
callbacks = list(xgb.cb.gblinear.history())
|
||||
)
|
||||
matplot(xgb.gblinear.history(bst), type = "l")
|
||||
# Componentwise boosting is known to have similar effect to Lasso regularization.
|
||||
# Try experimenting with various values of top_k, eta, nrounds,
|
||||
# as well as different feature_selectors.
|
||||
|
||||
# For xgb.cv:
|
||||
bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8,
|
||||
callbacks = list(xgb.cb.gblinear.history()))
|
||||
bst <- xgb.cv(
|
||||
param,
|
||||
dtrain,
|
||||
nfold = 5,
|
||||
nrounds = 100,
|
||||
eta = 0.8,
|
||||
callbacks = list(xgb.cb.gblinear.history())
|
||||
)
|
||||
# coefficients in the CV fold #3
|
||||
matplot(xgb.gblinear.history(bst)[[3]], type = 'l')
|
||||
matplot(xgb.gblinear.history(bst)[[3]], type = "l")
|
||||
|
||||
|
||||
#### Multiclass classification:
|
||||
#
|
||||
dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread)
|
||||
param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
|
||||
lambda = 0.0003, alpha = 0.0003, nthread = nthread)
|
||||
|
||||
param <- list(
|
||||
booster = "gblinear",
|
||||
objective = "multi:softprob",
|
||||
num_class = 3,
|
||||
lambda = 0.0003,
|
||||
alpha = 0.0003,
|
||||
nthread = nthread
|
||||
)
|
||||
|
||||
# For the default linear updater 'shotgun' it sometimes is helpful
|
||||
# to use smaller eta to reduce instability
|
||||
bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
|
||||
callbacks = list(xgb.cb.gblinear.history()))
|
||||
bst <- xgb.train(
|
||||
param,
|
||||
dtrain,
|
||||
list(tr = dtrain),
|
||||
nrounds = 50,
|
||||
eta = 0.5,
|
||||
callbacks = list(xgb.cb.gblinear.history())
|
||||
)
|
||||
|
||||
# Will plot the coefficient paths separately for each class:
|
||||
matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
|
||||
matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l')
|
||||
matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l')
|
||||
matplot(xgb.gblinear.history(bst, class_index = 0), type = "l")
|
||||
matplot(xgb.gblinear.history(bst, class_index = 1), type = "l")
|
||||
matplot(xgb.gblinear.history(bst, class_index = 2), type = "l")
|
||||
|
||||
# CV:
|
||||
bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5,
|
||||
callbacks = list(xgb.cb.gblinear.history(FALSE)))
|
||||
bst <- xgb.cv(
|
||||
param,
|
||||
dtrain,
|
||||
nfold = 5,
|
||||
nrounds = 70,
|
||||
eta = 0.5,
|
||||
callbacks = list(xgb.cb.gblinear.history(FALSE))
|
||||
)
|
||||
# 1st fold of 1st class
|
||||
matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l')
|
||||
matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = "l")
|
||||
|
||||
}
\seealso{

@@ -7,12 +7,12 @@
xgb.cb.print.evaluation(period = 1, showsd = TRUE)
}
\arguments{
\item{period}{results would be printed every number of periods}
\item{period}{Results would be printed every number of periods.}

\item{showsd}{whether standard deviations should be printed (when available)}
\item{showsd}{Whether standard deviations should be printed (when available).}
}
\value{
An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.
}
\description{
The callback function prints the result of evaluation at every \code{period} iterations.
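
A minimal usage sketch (not from the original docs; it assumes a DMatrix
\code{dtrain} and a parameter list \code{params}, and mirrors the positional
evals list used in the other examples on this page):

  bst <- xgb.train(params, dtrain, list(tr = dtrain), nrounds = 100,
                   callbacks = list(xgb.cb.print.evaluation(period = 10)))
  # prints the 'tr' evaluation metrics every 10th iteration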
|
@@ -2,12 +2,12 @@
% Please edit documentation in R/callbacks.R
\name{xgb.cb.reset.parameters}
\alias{xgb.cb.reset.parameters}
\title{Callback for resetting the booster's parameters at each iteration.}
\title{Callback for resetting booster parameters at each iteration}
\usage{
xgb.cb.reset.parameters(new_params)
}
\arguments{
\item{new_params}{a list where each element corresponds to a parameter that needs to be reset.
\item{new_params}{List of parameters needed to be reset.
Each element's value must be either a vector of values of length \code{nrounds}
to be set at each iteration,
or a function of two parameters \code{learning_rates(iteration, nrounds)}
@@ -15,10 +15,10 @@ which returns a new parameter value by using the current iteration number
and the total number of boosting rounds.}
}
\value{
An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.
}
\description{
Callback for resetting the booster's parameters at each iteration.
Callback for resetting booster parameters at each iteration
}
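
A short sketch of both accepted forms of \code{new_params} (illustration only;
the decay schedule is arbitrary, and the vector form assumes \code{nrounds = 50}):

  # vector form: one eta value per boosting round
  xgb.cb.reset.parameters(list(eta = seq(0.3, 0.05, length.out = 50)))

  # function form: computed from the iteration number and total rounds
  xgb.cb.reset.parameters(list(
    eta = function(iteration, nrounds) 0.3 * 0.95^(iteration - 1)
  ))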
\details{
Note that when training is resumed from some previous model, and a function is used to

@@ -2,23 +2,22 @@
% Please edit documentation in R/callbacks.R
\name{xgb.cb.save.model}
\alias{xgb.cb.save.model}
\title{Callback for saving a model file.}
\title{Callback for saving a model file}
\usage{
xgb.cb.save.model(save_period = 0, save_name = "xgboost.ubj")
}
\arguments{
\item{save_period}{Save the model to disk after every
\code{save_period} iterations; 0 means save the model at the end.}
\item{save_period}{Save the model to disk after every \code{save_period} iterations;
0 means save the model at the end.}

\item{save_name}{The name or path for the saved model file.
It can contain a \code{\link[base]{sprintf}} formatting specifier
to include the integer iteration number in the file name.
E.g., with \code{save_name} = 'xgboost_\%04d.model',
It can contain a \code{\link[=sprintf]{sprintf()}} formatting specifier to include the integer
iteration number in the file name. E.g., with \code{save_name = 'xgboost_\%04d.model'},
the file saved at iteration 50 would be named "xgboost_0050.model".}
}
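
For instance (a sketch following directly from the argument description above;
the training call is abbreviated), passing

  xgb.cb.save.model(save_period = 50, save_name = "xgboost_\%04d.ubj")

under \code{callbacks} in \code{xgb.train()} would write "xgboost_0050.ubj",
"xgboost_0100.ubj", and so on, every 50 iterations.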
\value{
An \code{xgb.Callback} object, which can be passed to \link{xgb.train},
but \bold{not} to \link{xgb.cv}.
An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}},
but \strong{not} to \code{\link[=xgb.cv]{xgb.cv()}}.
}
\description{
This callback function allows saving an xgb-model file, either periodically

@@ -10,12 +10,12 @@ xgb.config(object)
xgb.config(object) <- value
}
\arguments{
\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.}
\item{object}{Object of class \code{xgb.Booster}. \strong{Will be modified in-place} when assigning to it.}

\item{value}{An R list.}
\item{value}{A list.}
}
\value{
\code{xgb.config} will return the parameters as an R list.
Parameters as a list.
}
\description{
Accessors for model parameters as JSON string
@@ -25,7 +25,7 @@ Note that assignment is performed in-place on the booster C object, which unlike
of R attributes, doesn't follow typical copy-on-write semantics for assignment - i.e. all references
to the same booster will also get updated.

See \link{xgb.copy.Booster} for an example of this behavior.
See \code{\link[=xgb.copy.Booster]{xgb.copy.Booster()}} for an example of this behavior.
}
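
A minimal sketch of the two accessors (illustration only; \code{bst} is the
booster fitted in the example below):

  cfg <- xgb.config(bst)   # parameters as an R list
  xgb.config(bst) <- cfg   # assigned back in-place: every reference to this
                           # booster sees the change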
\examples{
data(agaricus.train, package = "xgboost")
@@ -35,9 +35,8 @@ nthread <- 1
data.table::setDTthreads(nthread)
train <- agaricus.train

bst <- xgboost(
  data = train$data,
  label = train$label,
bst <- xgb.train(
  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 1,
  nthread = nthread,

@@ -16,14 +16,18 @@ functions called on that copy will not affect the \code{model} variable.
\description{
Creates a deep copy of an 'xgb.Booster' object, such that the
C object pointer contained will be a different object, and hence functions
like \link{xgb.attr} will not affect the object from which it was copied.
like \code{\link[=xgb.attr]{xgb.attr()}} will not affect the object from which it was copied.
}
\examples{
library(xgboost)

data(mtcars)

y <- mtcars$mpg
x <- mtcars[, -1]

dm <- xgb.DMatrix(x, label = y, nthread = 1)

model <- xgb.train(
  data = dm,
  params = list(nthread = 1),

@@ -7,17 +7,18 @@
xgb.create.features(model, data, ...)
}
\arguments{
\item{model}{decision tree boosting model learned on the original data}
\item{model}{Decision tree boosting model learned on the original data.}

\item{data}{original data (usually provided as a \code{dgCMatrix} matrix)}
\item{data}{Original data (usually provided as a \code{dgCMatrix} matrix).}

\item{...}{currently not used}
\item{...}{Currently not used.}
}
\value{
\code{dgCMatrix} matrix including both the original data and the new features.
A \code{dgCMatrix} matrix including both the original data and the new features.
}
\description{
May improve the learning by adding new features to the training data based on the decision trees from a previously learned model.
May improve the learning by adding new features to the training data based on the
decision trees from a previously learned model.
}
\details{
This is a function inspired by paragraph 3.1 of the paper:
@@ -44,11 +45,11 @@ For example, consider the boosted tree model in Figure 1 with 2 subtrees,
where the first subtree has 3 leafs and the second 2 leafs. If an
instance ends up in leaf 2 in the first subtree and leaf 1 in
second subtree, the overall input to the linear classifier will
be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
be the binary vector \verb{[0, 1, 0, 1, 0]}, where the first 3 entries
correspond to the leaves of the first subtree and last 2 to
those of the second subtree.

\link{...}
...

We can understand boosted decision tree
based transformation as a supervised feature encoding that
@@ -57,15 +58,16 @@ vector. A traversal from root node to a leaf node represents
a rule on certain features."
}
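
The encoding described above can also be sketched by hand (an illustration, not
part of the original docs; it assumes \code{predict()} supports
\code{predleaf = TRUE}, returning one leaf index per tree, and uses the
\code{bst} fitted in the example below):

  leaf_idx <- predict(bst, agaricus.train$data, predleaf = TRUE)
  # one-hot encode each tree's leaf index and bind the blocks column-wise,
  # giving the binary vector from the paper for every observation
  binarized <- do.call(cbind, lapply(seq_len(ncol(leaf_idx)), function(tree) {
    f <- factor(leaf_idx[, tree])
    Matrix::sparse.model.matrix(~ f - 1)
  }))

\code{xgb.create.features()} performs this transformation (and appends the
original columns) in one step.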
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))

param <- list(max_depth=2, eta=1, objective='binary:logistic')
param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic')
nrounds = 4

bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)

# Model accuracy without new features
accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) /

@@ -26,141 +26,136 @@ xgb.cv(
)
}
\arguments{
\item{params}{the list of parameters. The complete list of parameters is
available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
is a shorter summary:
\item{params}{The list of parameters. The complete list of parameters is available in the
\href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}.
Below is a shorter summary:
\itemize{
\item \code{objective} objective function, common ones are
\item \code{objective}: Objective function, common ones are
\itemize{
\item \code{reg:squarederror} Regression with squared loss.
\item \code{binary:logistic} logistic regression for classification.
\item See \code{\link[=xgb.train]{xgb.train}()} for complete list of objectives.
}
\item \code{eta} step size of each boosting step
\item \code{max_depth} maximum depth of the tree
\item \code{nthread} number of thread used in training, if not set, all threads are used
\item \code{reg:squarederror}: Regression with squared loss.
\item \code{binary:logistic}: Logistic regression for classification.
}

See \code{\link{xgb.train}} for further details.
See also demo/ for walkthrough example in R.
See \code{\link[=xgb.train]{xgb.train()}} for complete list of objectives.
\item \code{eta}: Step size of each boosting step
\item \code{max_depth}: Maximum depth of the tree
\item \code{nthread}: Number of threads used in training. If not set, all threads are used
}

See \code{\link[=xgb.train]{xgb.train()}} for further details.
See also demo for walkthrough example in R.

Note that, while \code{params} accepts a \code{seed} entry and will use such parameter for model training if
supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG
system - thus, for reproducible results, one needs to call the \code{set.seed} function beforehand.}
system - thus, for reproducible results, one needs to call the \code{\link[=set.seed]{set.seed()}} function beforehand.}
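
For example (a sketch following directly from the note above, assuming a
DMatrix \code{dtrain} and a parameter list \code{params}):

  set.seed(123)   # fixes the fold assignment, making the CV reproducible
  cv <- xgb.cv(params = params, data = dtrain, nrounds = 10, nfold = 5)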
|
\item{data}{An \code{xgb.DMatrix} object, with corresponding fields like \code{label} or bounds as required
for model training by the objective.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix`
or `xgb.ExternalDMatrix` are not supported here.
}\if{html}{\out{</div>}}}
Note that only the basic \code{xgb.DMatrix} class is supported - variants such as \code{xgb.QuantileDMatrix}
or \code{xgb.ExtMemDMatrix} are not supported here.}

\item{nrounds}{the max number of iterations}
\item{nrounds}{The max number of iterations.}

\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
\item{nfold}{The original dataset is randomly partitioned into \code{nfold} equal size subsamples.}

\item{prediction}{A logical value indicating whether to return the test fold predictions
from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback.}
from each CV model. This parameter engages the \code{\link[=xgb.cb.cv.predict]{xgb.cb.cv.predict()}} callback.}

\item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}
\item{showsd}{Logical value whether to show standard deviation of cross validation.}

\item{metrics, }{list of evaluation metrics to be used in cross validation,
\item{metrics}{List of evaluation metrics to be used in cross validation,
when it is not specified, the evaluation metric is chosen according to objective function.
Possible options are:
\itemize{
\item \code{error} binary classification error rate
\item \code{rmse} Rooted mean square error
\item \code{logloss} negative log-likelihood function
\item \code{mae} Mean absolute error
\item \code{mape} Mean absolute percentage error
\item \code{auc} Area under curve
\item \code{aucpr} Area under PR curve
\item \code{merror} Exact matching error, used to evaluate multi-class classification
\item \code{error}: Binary classification error rate
\item \code{rmse}: Root mean square error
\item \code{logloss}: Negative log-likelihood function
\item \code{mae}: Mean absolute error
\item \code{mape}: Mean absolute percentage error
\item \code{auc}: Area under curve
\item \code{aucpr}: Area under PR curve
\item \code{merror}: Exact matching error used to evaluate multi-class classification
}}

\item{obj}{customized objective function. Returns gradient and second order
\item{obj}{Customized objective function. Returns gradient and second order
gradient with given prediction and dtrain.}

\item{feval}{customized evaluation function. Returns
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain.}
\item{feval}{Customized evaluation function. Returns
\code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.}

\item{stratified}{A \code{boolean} indicating whether sampling of folds should be stratified
\item{stratified}{Logical flag indicating whether sampling of folds should be stratified
by the values of outcome labels. For real-valued labels in regression objectives,
stratification will be done by discretizing the labels into up to 5 buckets beforehand.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing "auto", will be set to `TRUE` if the objective in `params` is a classification
objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to
`FALSE` otherwise.
If passing "auto", will be set to \code{TRUE} if the objective in \code{params} is a classification
objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to
\code{FALSE} otherwise.

This parameter is ignored when `data` has a `group` field - in such case, the splitting
will be based on whole groups (note that this might make the folds have different sizes).
This parameter is ignored when \code{data} has a \code{group} field - in such case, the splitting
will be based on whole groups (note that this might make the folds have different sizes).

Value `TRUE` here is \\bold\{not\} supported for custom objectives.
}\if{html}{\out{</div>}}}
Value \code{TRUE} here is \strong{not} supported for custom objectives.}
|
\item{folds}{\code{list} provides a possibility to use a list of pre-defined CV folds
(each element must be a vector of test fold's indices). When folds are supplied,
the \code{nfold} and \code{stratified} parameters are ignored.
\item{folds}{List with pre-defined CV folds (each element must be a vector of test fold's indices).
When folds are supplied, the \code{nfold} and \code{stratified} parameters are ignored.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ If `data` has a `group` field and the objective requires this field, each fold (list element)
must additionally have two attributes (retrievable through \link{attributes}) named `group_test`
and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to
the resulting DMatrices.
}\if{html}{\out{</div>}}}
If \code{data} has a \code{group} field and the objective requires this field, each fold (list element)
must additionally have two attributes (retrievable through \code{attributes}) named \code{group_test}
and \code{group_train}, which should hold the \code{group} to assign through \code{\link[=setinfo.xgb.DMatrix]{setinfo.xgb.DMatrix()}} to
the resulting DMatrices.}
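
A sketch of the shape such a fold takes for grouped data (illustration only;
the index and group values are made up):

  fold1 <- c(1, 2, 3, 4)                    # test-row indices for fold 1
  attr(fold1, "group_test") <- c(2, 2)      # group sizes for the test DMatrix
  attr(fold1, "group_train") <- c(3, 3, 2)  # group sizes for the train DMatrix
  folds <- list(fold1)                      # plus one such element per fold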
|
\item{train_folds}{\code{list} list specifying which indicies to use for training. If \code{NULL}
\item{train_folds}{List specifying which indices to use for training. If \code{NULL}
(the default) all indices not specified in \code{folds} will be used for training.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ This is not supported when `data` has `group` field.
}\if{html}{\out{</div>}}}
This is not supported when \code{data} has \code{group} field.}

\item{verbose}{\code{boolean}, print the statistics during the process}
\item{verbose}{Logical flag. Should statistics be printed during the process?}

\item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}.
\item{print_every_n}{Print each nth iteration evaluation messages when \code{verbose > 0}.
Default is 1 which means all messages are printed. This parameter is passed to the
\code{\link{xgb.cb.print.evaluation}} callback.}
\code{\link[=xgb.cb.print.evaluation]{xgb.cb.print.evaluation()}} callback.}

\item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered.
If set to an integer \code{k}, training with a validation set will stop if the performance
doesn't improve for \code{k} rounds.
Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.}
Setting this parameter engages the \code{\link[=xgb.cb.early.stop]{xgb.cb.early.stop()}} callback.}

\item{maximize}{If \code{feval} and \code{early_stopping_rounds} are set,
then this parameter must be set as well.
When it is \code{TRUE}, it means the larger the evaluation score the better.
This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.}
This parameter is passed to the \code{\link[=xgb.cb.early.stop]{xgb.cb.early.stop()}} callback.}

\item{callbacks}{a list of callback functions to perform various task during boosting.
See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the
\item{callbacks}{A list of callback functions to perform various task during boosting.
See \code{\link[=xgb.Callback]{xgb.Callback()}}. Some of the callbacks are automatically created depending on the
parameters' values. User can provide either existing or their own callback methods in order
to customize the training process.}

\item{...}{other parameters to pass to \code{params}.}
\item{...}{Other parameters to pass to \code{params}.}
}
\value{
An object of class \code{xgb.cv.synchronous} with the following elements:
An object of class 'xgb.cv.synchronous' with the following elements:
\itemize{
\item \code{call} a function call.
\item \code{params} parameters that were passed to the xgboost library. Note that it does not
capture parameters changed by the \code{\link{xgb.cb.reset.parameters}} callback.
\item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
\item \code{call}: Function call.
\item \code{params}: Parameters that were passed to the xgboost library. Note that it does not
capture parameters changed by the \code{\link[=xgb.cb.reset.parameters]{xgb.cb.reset.parameters()}} callback.
\item \code{evaluation_log}: Evaluation history stored as a \code{data.table} with the
first column corresponding to iteration number and the rest corresponding to the
CV-based evaluation means and standard deviations for the training and test CV-sets.
It is created by the \code{\link{xgb.cb.evaluation.log}} callback.
\item \code{niter} number of boosting iterations.
\item \code{nfeatures} number of features in training data.
\item \code{folds} the list of CV folds' indices - either those passed through the \code{folds}
It is created by the \code{\link[=xgb.cb.evaluation.log]{xgb.cb.evaluation.log()}} callback.
\item \code{niter}: Number of boosting iterations.
\item \code{nfeatures}: Number of features in training data.
\item \code{folds}: The list of CV folds' indices - either those passed through the \code{folds}
parameter or randomly generated.
\item \code{best_iteration} iteration number with the best evaluation metric value
\item \code{best_iteration}: Iteration number with the best evaluation metric value
(only available with early stopping).
}

Plus other potential elements that are the result of callbacks, such as a list \code{cv_predict} with
a sub-element \code{pred} when passing \code{prediction = TRUE}, which is added by the \link{xgb.cb.cv.predict}
a sub-element \code{pred} when passing \code{prediction = TRUE}, which is added by the \code{\link[=xgb.cb.cv.predict]{xgb.cb.cv.predict()}}
callback (note that one can also pass it manually under \code{callbacks} with different settings,
such as saving also the models created during cross validation); or a list \code{early_stop} which
will contain elements such as \code{best_iteration} when using the early stopping callback (\link{xgb.cb.early.stop}).
will contain elements such as \code{best_iteration} when using the early stopping callback (\code{\link[=xgb.cb.early.stop]{xgb.cb.early.stop()}}).
}
\description{
The cross validation function of xgboost.
@@ -179,11 +174,20 @@ All observations are used for both training and validation.
Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29}
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")

dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
             max_depth = 3, eta = 1, objective = "binary:logistic")

cv <- xgb.cv(
  data = dtrain,
  nrounds = 3,
  nthread = 2,
  nfold = 5,
  metrics = list("rmse", "auc"),
  max_depth = 3,
  eta = 1,
  objective = "binary:logistic"
)
print(cv)
print(cv, verbose=TRUE)
print(cv, verbose = TRUE)

}

@@ -2,7 +2,7 @@
% Please edit documentation in R/xgb.dump.R
\name{xgb.dump}
\alias{xgb.dump}
\title{Dump an xgboost model in text format.}
\title{Dump an XGBoost model in text format.}
\usage{
xgb.dump(
model,
@@ -14,43 +14,51 @@ xgb.dump(
)
}
\arguments{
\item{model}{the model object.}
\item{model}{The model object.}

\item{fname}{the name of the text file where to save the model text dump.
If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.}
\item{fname}{The name of the text file where to save the model text dump.
If not provided or set to \code{NULL}, the model is returned as a character vector.}

\item{fmap}{feature map file representing feature types.
See demo/ for walkthrough example in R, and
\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
for example Format.}
\item{fmap}{Feature map file representing feature types. See demo/ for a walkthrough
example in R, and \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
to see an example of the value.}

\item{with_stats}{whether to dump some additional statistics about the splits.
\item{with_stats}{Whether to dump some additional statistics about the splits.
When this option is on, the model dump contains two additional values:
gain is the approximate loss function gain we get in each split;
cover is the sum of second order gradient in each node.}

\item{dump_format}{either 'text', 'json', or 'dot' (graphviz) format could be specified.
\item{dump_format}{Either 'text', 'json', or 'dot' (graphviz) format could be specified.

Format 'dot' for a single tree can be passed directly to packages that consume this format
for graph visualization, such as function \code{\link[DiagrammeR:grViz]{DiagrammeR::grViz()}}}
for graph visualization, such as function \code{DiagrammeR::grViz()}}

\item{...}{currently not used}
\item{...}{Currently not used}
}
\value{
If fname is not provided or set to \code{NULL} the function will return the model
as a \code{character} vector. Otherwise it will return \code{TRUE}.
as a character vector. Otherwise it will return \code{TRUE}.
}
\description{
Dump an xgboost model in text format.
Dump an XGBoost model in text format.
}
\examples{
\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")

bst <- xgb.train(
  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 1,
  nthread = 2,
  nrounds = 2,
  objective = "binary:logistic"
)

# save the model in file 'xgb.model.dump'
dump_path = file.path(tempdir(), 'model.dump')
xgb.dump(bst, dump_path, with_stats = TRUE)
@@ -59,7 +67,7 @@ xgb.dump(bst, dump_path, with_stats = TRUE)
print(xgb.dump(bst, with_stats = TRUE))

# print in JSON format:
cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
cat(xgb.dump(bst, with_stats = TRUE, dump_format = "json"))

# plot first tree leveraging the 'dot' format
if (requireNamespace('DiagrammeR', quietly = TRUE)) {

@@ -2,24 +2,24 @@
% Please edit documentation in R/callbacks.R
\name{xgb.gblinear.history}
\alias{xgb.gblinear.history}
\title{Extract gblinear coefficients history.}
\title{Extract gblinear coefficients history}
\usage{
xgb.gblinear.history(model, class_index = NULL)
}
\arguments{
\item{model}{either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
using the \link{xgb.cb.gblinear.history} callback, but \bold{not} a booster
loaded from \link{xgb.load} or \link{xgb.load.raw}.}
\item{model}{Either an \code{xgb.Booster} or a result of \code{\link[=xgb.cv]{xgb.cv()}}, trained
using the \link{xgb.cb.gblinear.history} callback, but \strong{not} a booster
loaded from \code{\link[=xgb.load]{xgb.load()}} or \code{\link[=xgb.load.raw]{xgb.load.raw()}}.}

\item{class_index}{zero-based class index to extract the coefficients for only that
specific class in a multinomial multiclass model. When it is NULL, all the
specific class in a multinomial multiclass model. When it is \code{NULL}, all the
coefficients are returned. Has no effect in non-multiclass models.}
}
\value{
For an \link{xgb.train} result, a matrix (either dense or sparse) with the columns
For an \code{\link[=xgb.train]{xgb.train()}} result, a matrix (either dense or sparse) with the columns
corresponding to iteration's coefficients and the rows corresponding to boosting iterations.

For an \link{xgb.cv} result, a list of such matrices is returned with the elements
For an \code{\link[=xgb.cv]{xgb.cv()}} result, a list of such matrices is returned with the elements
corresponding to CV folds.

When there is more than one coefficient per feature (e.g. multi-class classification)
@@ -31,15 +31,15 @@ coefficients N+1 through 2N for the second class, and so on).
\description{
A helper function to extract the matrix of linear coefficients' history
from a gblinear model created while using the \link{xgb.cb.gblinear.history}
callback (which must be added manually as by default it's not used).
callback (which must be added manually as by default it is not used).
}
\details{
Note that this is an R-specific function that relies on R attributes that
are not saved when using xgboost's own serialization functions like \link{xgb.load}
or \link{xgb.load.raw}.
are not saved when using XGBoost's own serialization functions like \code{\link[=xgb.load]{xgb.load()}}
or \code{\link[=xgb.load.raw]{xgb.load.raw()}}.

In order for a serialized model to be accepted by this function, one must use R
serializers such as \link{saveRDS}.
serializers such as \code{\link[=saveRDS]{saveRDS()}}.
}
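
A round-trip sketch of that serialization note (illustration only; \code{bst}
is assumed to have been trained with the history callback as in the examples):

  fname <- file.path(tempdir(), "bst.rds")
  saveRDS(bst, fname)                      # R serialization keeps the attributes
  bst2 <- readRDS(fname)
  coef_path <- xgb.gblinear.history(bst2)  # history is still available
  # after xgb.save()/xgb.load() the history would be lost, as explained above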
\seealso{
\link{xgb.cb.gblinear.history}, \link{coef.xgb.Booster}.

@@ -7,7 +7,7 @@
xgb.get.DMatrix.data(dmat)
}
\arguments{
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \link{xgb.DMatrix}.}
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \code{\link[=xgb.DMatrix]{xgb.DMatrix()}}.}
}
\value{
The data held in the DMatrix, as a sparse CSR matrix (class \code{dgRMatrix}

@@ -7,10 +7,10 @@
xgb.get.DMatrix.num.non.missing(dmat)
}
\arguments{
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \link{xgb.DMatrix}.}
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \code{\link[=xgb.DMatrix]{xgb.DMatrix()}}.}
}
\value{
The number of non-missing entries in the DMatrix
The number of non-missing entries in the DMatrix.
}
\description{
Get Number of Non-Missing Entries in DMatrix

@@ -7,15 +7,14 @@
xgb.get.DMatrix.qcut(dmat, output = c("list", "arrays"))
}
\arguments{
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \link{xgb.DMatrix}.}
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \code{\link[=xgb.DMatrix]{xgb.DMatrix()}}.}

\item{output}{Output format for the quantile cuts. Possible options are:\itemize{
\item \code{"list"} will return the output as a list with one entry per column, where
each column will have a numeric vector with the cuts. The list will be named if
\code{dmat} has column names assigned to it.
\item{output}{Output format for the quantile cuts. Possible options are:
\itemize{
\item \code{"list"} will return the output as a list with one entry per column, where each column
will have a numeric vector with the cuts. The list will be named if \code{dmat} has column
names assigned to it.
\item \code{"arrays"} will return a list with entries \code{indptr} (base-0 indexing) and
\code{data}. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
\code{indptr[i]+1} to \code{indptr[i+1]}.
}}
}
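
A sketch of the "arrays" slicing rule stated above (illustration only;
\code{dm} is the quantized DMatrix from the example below):

  res <- xgb.get.DMatrix.qcut(dm, output = "arrays")
  i <- 1                                             # column of interest
  cuts_i <- res$data[(res$indptr[i] + 1):res$indptr[i + 1]]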
\value{
@@ -23,7 +22,7 @@ The quantile cuts, in the format specified by parameter \code{output}.
}
\description{
Get the quantile cuts (a.k.a. borders) from an \code{xgb.DMatrix}
that has been quantized for the histogram method (\code{tree_method="hist"}).
that has been quantized for the histogram method (\code{tree_method = "hist"}).

These cuts are used in order to assign observations to bins - i.e. these are ordered
boundaries which are used to determine assignment condition \verb{border_low < x < border_high}.
@@ -36,8 +35,8 @@ which will be output in sorted order from lowest to highest.
Different columns can have different numbers of bins according to their range.
}
\examples{
library(xgboost)
data(mtcars)

y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(x, label = y, nthread = 1)
@@ -45,11 +44,7 @@ dm <- xgb.DMatrix(x, label = y, nthread = 1)
# DMatrix is not quantized right away, but will be once a hist model is generated
model <- xgb.train(
  data = dm,
  params = list(
    tree_method = "hist",
    max_bin = 8,
    nthread = 1
  ),
  params = list(tree_method = "hist", max_bin = 8, nthread = 1),
  nrounds = 3
)

@@ -13,13 +13,13 @@ xgb.get.num.boosted.rounds(model)
\item{model, x}{A fitted \code{xgb.Booster} model.}
}
\value{
The number of rounds saved in the model, as an integer.
The number of rounds saved in the model as an integer.
}
\description{
Get the number of boosting rounds in a fitted booster
}
\details{
Note that setting booster parameters related to training
continuation / updates through \link{xgb.parameters<-} will reset the
continuation / updates through \code{\link[=xgb.parameters<-]{xgb.parameters<-()}} will reset the
number of rounds to zero.
}
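
For example (a sketch, assuming a DMatrix \code{dm} as in the other examples on
this page):

  model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 5)
  xgb.get.num.boosted.rounds(model)   # 5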
|
@@ -70,9 +70,8 @@ be on the same scale (which is also recommended when using L1 or L2 regularizati
# binomial classification using "gbtree":
data(agaricus.train, package = "xgboost")

bst <- xgboost(
  data = agaricus.train$data,
  label = agaricus.train$label,
bst <- xgb.train(
  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  max_depth = 2,
  eta = 1,
  nthread = 2,
@@ -83,9 +82,8 @@ bst <- xgboost(
xgb.importance(model = bst)

# binomial classification using "gblinear":
bst <- xgboost(
  data = agaricus.train$data,
  label = agaricus.train$label,
bst <- xgb.train(
  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  booster = "gblinear",
  eta = 0.3,
  nthread = 1,
@@ -97,9 +95,11 @@ xgb.importance(model = bst)
# multiclass classification using "gbtree":
nclass <- 3
nrounds <- 10
mbst <- xgboost(
  data = as.matrix(iris[, -5]),
  label = as.numeric(iris$Species) - 1,
mbst <- xgb.train(
  data = xgb.DMatrix(
    as.matrix(iris[, -5]),
    label = as.numeric(iris$Species) - 1
  ),
  max_depth = 3,
  eta = 0.2,
  nthread = 2,
@@ -123,9 +123,11 @@ xgb.importance(
)

# multiclass classification using "gblinear":
mbst <- xgboost(
  data = scale(as.matrix(iris[, -5])),
  label = as.numeric(iris$Species) - 1,
mbst <- xgb.train(
  data = xgb.DMatrix(
    scale(as.matrix(iris[, -5])),
    label = as.numeric(iris$Species) - 1
  ),
  booster = "gblinear",
  eta = 0.2,
  nthread = 1,

@@ -12,30 +12,33 @@ xgb.is.same.Booster(obj1, obj2)
\item{obj2}{Booster model to compare with \code{obj1}.}
}
\value{
Either \code{TRUE} or \code{FALSE} according to whether the two boosters share
the underlying C object.
Either \code{TRUE} or \code{FALSE} according to whether the two boosters share the
underlying C object.
}
\description{
Checks whether two booster objects refer to the same underlying C object.
}
\details{
As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr'
As booster objects (as returned by e.g. \code{\link[=xgb.train]{xgb.train()}}) contain an R 'externalptr'
object, they don't follow typical copy-on-write semantics of other R objects - that is, if
one assigns a booster to a different variable and modifies that new variable through in-place
methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new
methods like \code{\link[=xgb.attr<-]{xgb.attr<-()}}, the modification will be applied to both the old and the new
variable, unlike typical R assignments which would only modify the latter.

This function allows checking whether two booster objects share the same 'externalptr',
regardless of the R attributes that they might have.

In order to duplicate a booster in such a way that the copy wouldn't share the same
'externalptr', one can use function \link{xgb.copy.Booster}.
'externalptr', one can use function \code{\link[=xgb.copy.Booster]{xgb.copy.Booster()}}.
}
\examples{
library(xgboost)

data(mtcars)

y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])

model <- xgb.train(
  params = list(nthread = 1),
  data = xgb.DMatrix(x, label = y, nthread = 1),
@@ -55,5 +58,5 @@ xgb.attr(model, "my_attr") # gets modified
xgb.attr(model_deep_copy, "my_attr") # doesn't get modified
}
\seealso{
\link{xgb.copy.Booster}
\code{\link[=xgb.copy.Booster]{xgb.copy.Booster()}}
}
