Merge branch 'master' of ssh://github.com/tqchen/xgboost
commit 9a85c108e2

CHANGES.md
@@ -21,8 +21,16 @@ xgboost-0.3
 * Add [Code Guide](src/README.md) for customizing objective function and evaluation
 * Add R module
 
-in progress version
+in progress 0.4
 =====
-* Distributed version
-* Feature importance visualization in R module, thanks to Michael Benesty
-* Predict leaf inde
+* Distributed version of xgboost that runs on YARN, scales to billions of examples
+* Direct save/load data and model from/to S3 and HDFS
+* Feature importance visualization in R module, by Michael Benesty
+* Predict leaf index
+* Poisson regression for counts data
+* Early stopping option in training
+* Native save load support in R and python
+  - xgboost models now can be saved using save/load in R
+  - xgboost python model is now pickable
+* sklearn wrapper is supported in python module
+* Experimental External memory version

R-package/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: xgboost
 Type: Package
 Title: eXtreme Gradient Boosting
-Version: 0.3-4
+Version: 0.4-0
 Date: 2014-12-28
 Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>, Michael Benesty <michael@benesty.fr>
 Maintainer: Tong He <hetong007@gmail.com>

R documentation for xgb.train parameters (roxygen source)
@@ -16,11 +16,11 @@
 #' 2.1. Parameter for Tree Booster
 #'
 #' \itemize{
-#' \item \code{eta} step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinkage the feature weights to make the boosting process more conservative. Default: 0.3
+#' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
 #' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
 #' \item \code{max_depth} maximum depth of a tree. Default: 6
 #' \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
-#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1
+#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
 #' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
 #' \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
 #' }

R documentation for xgb.train parameters (generated .Rd)
@@ -23,11 +23,11 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
 2.1. Parameter for Tree Booster
 
 \itemize{
-\item \code{eta} step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinkage the feature weights to make the boosting process more conservative. Default: 0.3
+\item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
 \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
 \item \code{max_depth} maximum depth of a tree. Default: 6
 \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
-\item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1
+\item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
 \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
 \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
 }
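
The rewritten `eta` and `subsample` entries above describe one practical trade-off: a smaller learning rate and row subsampling make each boosting step more conservative, so more rounds are needed. A minimal sketch of that interplay, not part of this commit, written against the Python interface with invented toy data:

```python
import numpy as np
import xgboost as xgb

# Toy data, only so the example runs end to end.
X = np.random.rand(500, 10)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)
dtrain = xgb.DMatrix(X, label=y)

params = {
    "objective": "binary:logistic",
    "eta": 0.1,        # lower learning rate: each new tree contributes less
    "max_depth": 6,
    "subsample": 0.5,  # each tree is grown on a random half of the rows
    "min_child_weight": 1,
}

# A smaller eta (and subsample < 1) generally needs more boosting rounds
# (nrounds in R, num_boost_round here) to reach the same training loss.
bst = xgb.train(params, dtrain, num_boost_round=300)
```

The R `xgb.train` documented above accepts the same parameter names through its `params` list.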

Otto RMarkdown vignette "Understanding XGBoost Model on Otto Dataset" (demo/kaggle-otto)
@@ -1,7 +1,11 @@
 ---
 title: "Understanding XGBoost Model on Otto Dataset"
 author: "Michaël Benesty"
-output: html_document
+output:
+  rmarkdown::html_vignette:
+    css: ../../R-package/vignettes/vignette.css
+    number_sections: yes
+    toc: yes
 ---
 
 Introduction
@@ -9,16 +13,15 @@ Introduction
 
 **XGBoost** is an implementation of the famous gradient boosting algorithm. This model is often described as a *blackbox*, meaning it works well but it is not trivial to understand how. Indeed, the model is made of hundreds (thousands?) of decision trees. You may wonder how possible a human would be able to have a general view of the model?
 
-While XGBoost is known for its fast speed and accurate predictive power. It also comes with various functions to help you understand the model.
-The purpose of this RMarkdown document is to demonstrate how we can leverage the functions already implemented in **XGBoost R** package for that purpose. Of course, everything showed below can be applied to the dataset you may have to manipulate at work or wherever!
-
-First we will train a model on the **OTTO** dataset, then we will generate two vizualisations to get a clue of what is important to the model, finally, we will see how we can leverage these information.
-
+While XGBoost is known for its fast speed and accurate predictive power, it also comes with various functions to help you understand the model.
+The purpose of this RMarkdown document is to demonstrate how easily we can leverage the functions already implemented in **XGBoost R** package. Of course, everything showed below can be applied to the dataset you may have to manipulate at work or wherever!
+
+First we will prepare the **Otto** dataset and train a model, then we will generate two vizualisations to get a clue of what is important to the model, finally, we will see how we can leverage these information.
 
 Preparation of the data
 =======================
 
-This part is based on the tutorial example by [Tong He](https://github.com/dmlc/xgboost/blob/master/demo/kaggle-otto/otto_train_pred.R)
+This part is based on the **R** tutorial example by [Tong He](https://github.com/dmlc/xgboost/blob/master/demo/kaggle-otto/otto_train_pred.R)
 
 First, let's load the packages and the dataset.
 
@@ -30,29 +33,30 @@ require(magrittr)
 train <- fread('data/train.csv', header = T, stringsAsFactors = F)
 test <- fread('data/test.csv', header=TRUE, stringsAsFactors = F)
 ```
-> `magrittr` and `data.table` are here to make the code cleaner and more rapid.
+> `magrittr` and `data.table` are here to make the code cleaner and much more rapid.
 
-Let's see what is in this dataset.
+Let's explore the dataset.
 
 ```{r explore}
 # Train dataset dimensions
 dim(train)
 
 # Training content
-train[1:6,1:5, with =F]
+train[1:6, 1:5, with =F]
 
 # Test dataset dimensions
 dim(train)
 
 # Test content
-test[1:6,1:5, with =F]
+test[1:6, 1:5, with =F]
 ```
 > We only display the 6 first rows and 5 first columns for convenience
 
-Each column represents a feature measured by an integer. Each row is a product.
+Each *column* represents a feature measured by an integer. Each *row* is an **Otto** product.
 
 Obviously the first column (`ID`) doesn't contain any useful information.
-To let the algorithm focus on real stuff, we will delete the column.
+
+To let the algorithm focus on real stuff, we will delete it.
 
 ```{r clean, results='hide'}
 # Delete ID column in training dataset
@@ -62,7 +66,7 @@ train[, id := NULL]
 test[, id := NULL]
 ```
 
-According to the `OTTO` challenge description, we have here a multi class classification challenge. We need to extract the labels (here the name of the different classes) from the dataset. We only have two files (test and training), it seems logical that the training file contains the class we are looking for. Usually the labels is in the first or the last column. Let's check the content of the last column.
+According to its description, the **Otto** challenge is a multi class classification challenge. We need to extract the labels (here the name of the different classes) from the dataset. We only have two files (test and training), it seems logical that the training file contains the class we are looking for. Usually the labels is in the first or the last column. We already know what is in the first column, let's check the content of the last one.
 
 ```{r searchLabel}
 # Check the content of the last column
@@ -71,7 +75,7 @@ train[1:6, ncol(train), with = F]
 nameLastCol <- names(train)[ncol(train)]
 ```
 
-The class are provided as character string in the **`r ncol(train)`**th column called **`r nameLastCol`**. As you may know, **XGBoost** doesn't support anything else than numbers. So we will convert classes to integers. Moreover, according to the documentation, it should start at 0.
+The classes are provided as character string in the **`r ncol(train)`**th column called **`r nameLastCol`**. As you may know, **XGBoost** doesn't support anything else than numbers. So we will convert classes to integers. Moreover, according to the documentation, it should start at 0.
 
 For that purpose, we will:
 
@@ -81,19 +85,19 @@ For that purpose, we will:
 * remove 1 to the new value
 
 ```{r classToIntegers}
-# Convert to classes to numbers
+# Convert from classes to numbers
 y <- train[, nameLastCol, with = F][[1]] %>% gsub('Class_','',.) %>% {as.integer(.) -1}
 # Display the first 5 levels
 y[1:5]
 ```
 
-We remove label column from training dataset, otherwise XGBoost would use it to guess the labels!!!
+We remove label column from training dataset, otherwise **XGBoost** would use it to guess the labels!
 
 ```{r deleteCols, results='hide'}
 train[, nameLastCol:=NULL, with = F]
 ```
 
-`data.table` is an awesome implementation of data.frame, unfortunately it is not a format supported natively by XGBoost. We need to convert both datasets (training and test) in numeric Matrix format.
+`data.table` is an awesome implementation of data.frame, unfortunately it is not a format supported natively by **XGBoost**. We need to convert both datasets (training and test) in numeric Matrix format.
 
 ```{r convertToNumericMatrix}
 trainMatrix <- train[,lapply(.SD,as.numeric)] %>% as.matrix
@@ -103,9 +107,9 @@ testMatrix <- test[,lapply(.SD,as.numeric)] %>% as.matrix
 Model training
 ==============
 
-Before the learning we will use the cross validation to evaluate the our error rate.
+Before the learning we will use the cross validation to evaluate the error rate.
 
-Basically XGBoost will divide the training data in `nfold` parts, then XGBoost will retain the first part and use it as the test data. Then it will reintegrate the first part to the training dataset and retain the second part, do a training and so on...
+Basically **XGBoost** will divide the training data in `nfold` parts, then **XGBoost** will retain the first part and use it as the test data. Then it will reintegrate the first part to the training dataset and retain the second part, do a training and so on...
 
 Look at the function documentation for more information.
 
@@ -140,21 +144,21 @@ Feature importance
 
 So far, we have built a model made of **`r nround`** trees.
 
-To build a tree, the dataset is divided recursively several times. At the end of the process, you get groups of observations (here, these observations are properties regarding **OTTO** products).
+To build a *tree*, the dataset is divided recursively `max.depth` times. At the end of the process, you get groups of observations (here, these observations are properties regarding **Otto** products).
 
 Each division operation is called a *split*.
 
-Each group at each division level is called a branch and the deepest level is called a **leaf**.
+Each group at each division level is called a *branch* and the deepest level is called a *leaf*.
 
-In the final model, these leafs are supposed to be as pure as possible for each tree, meaning in our case that each leaf should be made of one class of **OTTO** product only (of course it is not true, but that's what we try to achieve in a minimum of splits).
+In the final model, these leafs are supposed to be as pure as possible for each tree, meaning in our case that each leaf should be made of one class of **Otto** product only (of course it is not true, but that's what we try to achieve in a minimum of splits).
 
 **Not all splits are equally important**. Basically the first split of a tree will have more impact on the purity that, for instance, the deepest split. Intuitively, we understand that the first split makes most of the work, and the following splits focus on smaller parts of the dataset which have been missclassified by the first tree.
 
-In the same way, in Boosting we try to optimize the missclassification at each round (it is called the **loss**). So the first tree will do the big work and the following trees will focus on the remaining, on the parts not correctly learned by the previous trees.
+In the same way, in Boosting we try to optimize the missclassification at each round (it is called the *loss*). So the first tree will do most of the work and the following trees will focus on the remaining, on the parts not correctly learned by the previous trees.
 
-The improvement brought by each split can be measured, it is the **gain**.
+The improvement brought by each split can be measured, it is the *gain*.
 
-Each split is done on one feature only at one value.
+Each split is done on one feature only at one specific value.
 
 Let's see what the model looks like.
 
@@ -168,13 +172,13 @@ Clearly, it is not easy to understand what it means.
 
 Basically each line represents a branch, there is the tree ID, the feature ID, the point where it splits, and information regarding the next branches (left, right, when the row for this feature is N/A).
 
-Hopefully, XGBoost offers a better representation: **feature importance**.
+Hopefully, **XGBoost** offers a better representation: **feature importance**.
 
 Feature importance is about averaging the gain of each feature for all split and all trees.
 
 Then we can use the function `xgb.plot.importance`.
 
-```{r importanceFeature}
+```{r importanceFeature, fig.align='center', fig.height=5, fig.width=10}
 # Get the feature real names
 names <- dimnames(trainMatrix)[[2]]
 
@@ -184,7 +188,8 @@ importance_matrix <- xgb.importance(names, model = bst)
 # Nice graph
 xgb.plot.importance(importance_matrix[1:10,])
 ```
-> To make it understandable we first extract the column names from the `Matrix`.
+
+> To make the graph understandable we first extract the column names from the `Matrix`.
 
 Interpretation
 --------------
@@ -193,9 +198,9 @@ In the feature importance above, we can see the first 10 most important features
 
 This function gives a color to each bar. Basically a K-means clustering is applied to group each feature by importance.
 
-From here you can take several actions. For instance you can remove the less important feature (feature selection process), or go deeper in the interaction between the most important features and labels.
+From here you can take several actions. For instance you can remove the less important features (feature selection process), or go deeper in the interaction between the most important features and labels.
 
-Or you can just reason about why these features are so importat (in OTTO challenge we can't go this way because there is not enough information).
+Or you can try to guess why these features are so importat (in **Otto** challenge we can't go this way because there is not enough information).
 
 Tree graph
 ----------
@@ -204,19 +209,19 @@ Feature importance gives you feature weight information but not interaction between features.
 
 **XGBoost R** package have another useful function for that.
 
-```{r treeGraph, dpi=300, fig.align='left'}
+```{r treeGraph, dpi=1500, fig.align='left'}
 xgb.plot.tree(feature_names = names, model = bst, n_first_tree = 2)
 ```
 
 We are just displaying the first two trees here.
 
 On simple models the first two trees may be enough. Here, it might not be the case. We can see from the size of the trees that the intersaction between features is complicated.
-Besides, XGBoost generate `k` trees at each round for a `k`-classification problem. Therefore the two trees illustrated here are trying to classify data into different classes.
+Besides, **XGBoost** generate `K` trees at each round for a `K`-classification problem. Therefore the two trees illustrated here are trying to classify data into different classes.
 
 Going deeper
 ============
 
-There are two documents you may want to check to go deeper:
+There are 3 documents you may be interested in:
 
 * [xgboostPresentation.Rmd](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd): general presentation
 * [discoverYourData.Rmd](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/discoverYourData.Rmd): explaining feature analysus
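
The vignette's point that feature importance is simply the split gain of each feature averaged over all splits and all trees can be made concrete. The sketch below is not part of this commit; it assumes tree dumps in the plain-text format returned by the Python `Booster.get_dump(with_stats=True)` and aggregates the `gain=` statistic per feature:

```python
import re
from collections import defaultdict


def average_gain(dump):
    """Average the split gain of each feature over all trees.

    `dump` is a list of tree-dump strings such as those returned by
    Booster.get_dump(with_stats=True); a split line looks roughly like
    '0:[f12<1.5] yes=1,no=2,missing=1,gain=42.1,cover=100'.
    """
    totals = defaultdict(float)
    counts = defaultdict(int)
    for tree in dump:
        # One match per split line: captured feature name and its gain.
        for feat, gain in re.findall(r"\[(\w+)<[^\]]*\].*?gain=([0-9eE.+-]+)", tree):
            totals[feat] += float(gain)
            counts[feat] += 1
    return {feat: totals[feat] / counts[feat] for feat in totals}


# Example (assumes a trained Booster `bst`):
#   scores = average_gain(bst.get_dump(with_stats=True))
#   top10 = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:10]
```

Sorting the result by value gives a ranking in the same spirit as the `xgb.importance` table plotted above, although the R function applies its own normalisation.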

Python package setup.py
@@ -28,7 +28,7 @@ if len(lib_path) == 0:
     raise XGBoostLibraryNotFound("XGBoost library not found. Did you run "
                                  "../make?")
 setup(name="xgboost",
-      version="0.32",
+      version="0.40",
       description="Python wrappers for XGBoost: eXtreme Gradient Boosting",
       zip_safe=False,
       py_modules=['xgboost'],

Python wrapper module xgboost.py
@@ -1,8 +1,8 @@
 # coding: utf-8
 
 """
 xgboost: eXtreme Gradient Boosting library
-
+Version: 0.40
 Authors: Tianqi Chen, Bing Xu
 Early stopping by Zygmunt Zając
 """
@@ -31,6 +31,9 @@ except ImportError:
 class XGBoostLibraryNotFound(Exception):
     pass
 
+class XGBoostError(Exception):
+    pass
+
 __all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train']
 
 if sys.version_info[0] == 3:
@@ -91,6 +94,14 @@ def ctypes2numpy(cptr, length, dtype):
         raise RuntimeError('memmove failed')
     return res
 
+def ctypes2buffer(cptr, length):
+    if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
+        raise RuntimeError('expected char pointer')
+    res = bytearray(length)
+    rptr = (ctypes.c_char * length).from_buffer(res)
+    if not ctypes.memmove(rptr, cptr, length):
+        raise RuntimeError('memmove failed')
+    return res
+
 def c_str(string):
     return ctypes.c_char_p(string.encode('utf-8'))
@@ -470,19 +481,26 @@ class Booster(object):
 
         Parameters
         ----------
-        fname : string or file handle
-            Output file name or handle. If a handle is given must be a BytesIO
-            object or a file opened for writing in binary format.
+        fname : string
+            Output file name
         """
         if isinstance(fname, string_types): # assume file name
             xglib.XGBoosterSaveModel(self.handle, c_str(fname))
         else:
-            length = ctypes.c_ulong()
-            cptr = xglib.XGBoosterGetModelRaw(self.handle,
-                                              ctypes.byref(length))
-            address = ctypes.addressof(cptr.contents)
-            buf = (ctypes.c_char * length.value).from_address(address)
-            fname.write(buf)
+            raise TypeError("fname must be a string")
+
+    def save_raw(self):
+        """
+        Save the model to a in memory buffer represetation
+
+        Returns
+        -------
+        a in memory buffer represetation of the model
+        """
+        length = ctypes.c_ulong()
+        cptr = xglib.XGBoosterGetModelRaw(self.handle,
+                                          ctypes.byref(length))
+        return ctypes2buffer(cptr, length.value)
 
     def load_model(self, fname):
         """
@@ -490,15 +508,15 @@ class Booster(object):
 
         Parameters
         ----------
-        fname : string of file handle
-            Input file name or file handle object.
+        fname : string or a memory buffer
+            Input file name or memory buffer(see also save_raw)
         """
-        if isinstance(fname, string_types): # assume file name
+        if isinstance(fname, str): # assume file name
             xglib.XGBoosterLoadModel(self.handle, c_str(fname))
         else:
-            buf = fname.getbuffer()
-            length = ctypes.c_ulong(buf.nbytes)
-            ptr = ctypes.byref(ctypes.c_void_p.from_buffer(buf))
+            buf = fname
+            length = ctypes.c_ulong(len(buf))
+            ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
             xglib.XGBoosterLoadModelFromBuffer(self.handle, ptr, length)
 
     def dump_model(self, fo, fmap='', with_stats=False):
@@ -838,7 +856,7 @@ class XGBModel(XGBModelBase):
                  nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
                  base_score=0.5, seed=0):
         if not SKLEARN_INSTALLED:
-            raise Exception('sklearn needs to be installed in order to use this module')
+            raise XGBError('sklearn needs to be installed in order to use this module')
         self.max_depth = max_depth
         self.learning_rate = learning_rate
         self.n_estimators = n_estimators
@@ -855,25 +873,36 @@ class XGBModel(XGBModelBase):
         self.base_score = base_score
         self.seed = seed
 
-        self._Booster = Booster()
+        self._Booster = None
 
     def __getstate__(self):
         # can't pickle ctypes pointers so put _Booster in a BytesIO obj
-
         this = self.__dict__.copy()  # don't modify in place
-
-        tmp = BytesIO()
-        this["_Booster"].save_model(tmp)
-        tmp.seek(0)
-        this["_Booster"] = tmp
-
+        bst = this["_Booster"]
+        if bst is not None:
+            raw = this["_Booster"].save_raw()
+            this["_Booster"] = raw
         return this
 
     def __setstate__(self, state):
-        booster = state["_Booster"]
-        state["_Booster"] = Booster(model_file=booster)
+        bst = state["_Booster"]
+        if bst is not None:
+            state["_Booster"] = Booster(model_file=bst)
         self.__dict__.update(state)
 
+    def booster(self):
+        """
+        get the underlying xgboost Booster of this model
+        will raise an exception when fit was not called
+
+        Returns
+        -------
+        booster : a xgboost booster of underlying model
+        """
+        if self._Booster is None:
+            raise XGBError('need to call fit beforehand')
+        return self._Booster
+
     def get_xgb_params(self):
         xgb_params = self.get_params()
 
@@ -890,7 +919,7 @@ class XGBModel(XGBModelBase):
 
     def predict(self, X):
         testDmatrix = DMatrix(X)
-        return self._Booster.predict(testDmatrix)
+        return self.booster().predict(testDmatrix)
 
 
 class XGBClassifier(XGBModel, XGBClassifier):
@@ -931,7 +960,7 @@ class XGBClassifier(XGBModel, XGBClassifier):
 
     def predict(self, X):
         testDmatrix = DMatrix(X)
-        class_probs = self._Booster.predict(testDmatrix)
+        class_probs = self.booster().predict(testDmatrix)
         if len(class_probs.shape) > 1:
             column_indexes = np.argmax(class_probs, axis=1)
         else:
@@ -941,7 +970,7 @@ class XGBClassifier(XGBModel, XGBClassifier):
 
     def predict_proba(self, X):
         testDmatrix = DMatrix(X)
-        class_probs = self._Booster.predict(testDmatrix)
+        class_probs = self.booster().predict(testDmatrix)
         if self.objective == "multi:softprob":
             return class_probs
         else:
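
Taken together, `ctypes2buffer`, `Booster.save_raw`, the buffer branch of `load_model`, and the guarded `__getstate__`/`__setstate__` are what make the sklearn-style wrappers picklable, as announced in CHANGES.md. A minimal sketch of the behaviour this enables, not part of the commit and using invented toy data:

```python
import pickle

import numpy as np
import xgboost as xgb

# Toy data, only so the example runs end to end.
X = np.random.rand(200, 5)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

# _Booster now starts as None and is only created by fit(), so even an
# unfitted wrapper object pickles cleanly.
clf = xgb.XGBClassifier(n_estimators=10, max_depth=3)
clf.fit(X, y)

# __getstate__ replaces the ctypes-backed Booster with its raw byte buffer
# (via save_raw); __setstate__ rebuilds a Booster from that buffer on load.
blob = pickle.dumps(clf)
restored = pickle.loads(blob)

assert np.allclose(clf.predict_proba(X), restored.predict_proba(X))
```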

C++ wrapper implementation (xgboost_wrapper.cpp)
@@ -62,6 +62,7 @@ class Booster: public learner::BoostLearner {
     this->init_model = true;
   }
   inline const char *GetModelRaw(bst_ulong *out_len) {
+    this->CheckInitModel();
     model_str.resize(0);
     utils::MemoryBufferStream fs(&model_str);
     learner::BoostLearner::SaveModel(fs, false);
@@ -322,8 +323,10 @@ extern "C"{
   void XGBoosterLoadModel(void *handle, const char *fname) {
     static_cast<Booster*>(handle)->LoadModel(fname);
   }
-  void XGBoosterSaveModel(const void *handle, const char *fname) {
-    static_cast<const Booster*>(handle)->SaveModel(fname, false);
+  void XGBoosterSaveModel(void *handle, const char *fname) {
+    Booster *bst = static_cast<Booster*>(handle);
+    bst->CheckInitModel();
+    bst->SaveModel(fname, false);
   }
   void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len) {
     static_cast<Booster*>(handle)->LoadModelFromBuffer(buf, len);

C++ wrapper header (xgboost_wrapper.h)
@@ -203,7 +203,7 @@ extern "C" {
   * \param handle handle
   * \param fname file name
   */
-  XGB_DLL void XGBoosterSaveModel(const void *handle, const char *fname);
+  XGB_DLL void XGBoosterSaveModel(void *handle, const char *fname);
  /*!
   * \brief load model from in memory buffer
   * \param handle handle