[R] address some lintr warnings (#8609)
This commit is contained in:
parent 53e6e32718
commit 17ce1f26c8
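The warnings addressed by this commit are the kind reported by the {lintr} package. As a minimal sketch of how such warnings can be reproduced (not part of the commit; the directory path and the exact linter set are assumptions), the R package sources could be checked like this:

```r
# Hypothetical lintr invocation; the linters listed here correspond to the
# style issues fixed in this diff: spaces around infix operators such as `=`,
# a space after commas, no trailing semicolons, and braces around `if` bodies.
library(lintr)
lint_dir(
  "R-package/R",
  linters = linters_with_defaults(
    infix_spaces_linter(),
    commas_linter(),
    semicolon_linter(),
    brace_linter()
  )
)
```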
@@ -611,7 +611,7 @@ cb.cv.predict <- function(save_models = FALSE) {
 #' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l')
 #'
 #' @export
-cb.gblinear.history <- function(sparse=FALSE) {
+cb.gblinear.history <- function(sparse = FALSE) {
 coefs <- NULL

 init <- function(env) {

@@ -629,7 +629,7 @@ xgb.attributes <- function(object) {
 #' @export
 xgb.config <- function(object) {
 handle <- xgb.get.handle(object)
-.Call(XGBoosterSaveJsonConfig_R, handle);
+.Call(XGBoosterSaveJsonConfig_R, handle)
 }

 #' @rdname xgb.config

@@ -119,10 +119,10 @@
 #' print(cv, verbose=TRUE)
 #'
 #' @export
-xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NA,
-prediction = FALSE, showsd = TRUE, metrics=list(),
+xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
+prediction = FALSE, showsd = TRUE, metrics = list(),
 obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, train_folds = NULL,
-verbose = TRUE, print_every_n=1L,
+verbose = TRUE, print_every_n = 1L,
 early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) {

 check.deprecation(...)

@@ -38,7 +38,7 @@
 #' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
 #'
 #' @export
-xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE,
+xgb.dump <- function(model, fname = NULL, fmap = "", with_stats = FALSE,
 dump_format = c("text", "json"), ...) {
 check.deprecation(...)
 dump_format <- match.arg(dump_format)
@@ -34,7 +34,7 @@ df[, ID := NULL]
 # Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
 fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
 # train a xgbTree model using caret::train
-model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl)
+model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl)

 # Instead of tree for our boosters, you can also fit a linear regression or logistic regression model
 # using xgbLinear

@@ -404,7 +404,7 @@ test_that("Configuration works", {
 config <- xgb.config(bst)
 xgb.config(bst) <- config
 reloaded_config <- xgb.config(bst)
-expect_equal(config, reloaded_config);
+expect_equal(config, reloaded_config)
 })

 test_that("strict_shape works", {

@@ -53,7 +53,7 @@ test_that("xgb.DMatrix: saving, loading", {
 dtrain <- xgb.DMatrix(tmp_file)
 expect_equal(colnames(dtrain), cnames)

-ft <- rep(c("c", "q"), each=length(cnames)/2)
+ft <- rep(c("c", "q"), each = length(cnames) / 2)
 setinfo(dtrain, "feature_type", ft)
 expect_equal(ft, getinfo(dtrain, "feature_type"))
 })

@@ -440,7 +440,7 @@ test_that("xgb.plot.shap.summary works", {
 })

 test_that("check.deprecation works", {
-ttt <- function(a = NNULL, DUMMY=NULL, ...) {
+ttt <- function(a = NNULL, DUMMY = NULL, ...) {
 check.deprecation(...)
 as.list((environment()))
 }
@@ -28,7 +28,9 @@ Package loading:
 require(xgboost)
 require(Matrix)
 require(data.table)
-if (!require('vcd')) install.packages('vcd')
+if (!require('vcd')) {
+install.packages('vcd')
+}
 ```

 > **VCD** package is used for one of its embedded dataset only.

@@ -100,7 +102,7 @@ Note that we transform it to `factor` so the algorithm treat these age groups as
 Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation.

 ```{r}
-head(df[,AgeDiscret := as.factor(round(Age/10,0))])
+head(df[, AgeDiscret := as.factor(round(Age / 10, 0))])
 ```

 ##### Random split into two groups

@@ -108,7 +110,7 @@ head(df[,AgeDiscret := as.factor(round(Age/10,0))])
 Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. We choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...).

 ```{r}
-head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
+head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))])
 ```

 ##### Risks in adding correlated features
@@ -124,13 +126,13 @@ Fortunately, decision tree algorithms (including boosted trees) are very robust
 We remove ID as there is nothing to learn from this feature (it would just add some noise).

 ```{r, results='hide'}
-df[,ID:=NULL]
+df[, ID := NULL]
 ```

 We will list the different values for the column `Treatment`:

 ```{r}
-levels(df[,Treatment])
+levels(df[, Treatment])
 ```


@@ -147,7 +149,7 @@ For example, the column `Treatment` will be replaced by two columns, `TreatmentP
 Column `Improved` is excluded because it will be our `label` column, the one we want to predict.

 ```{r, warning=FALSE,message=FALSE}
-sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1]
+sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[, -1]
 head(sparse_matrix)
 ```

@@ -156,7 +158,7 @@ head(sparse_matrix)
 Create the output `numeric` vector (not as a sparse `Matrix`):

 ```{r}
-output_vector <- df[,Improved] == "Marked"
+output_vector <- df[, Improved] == "Marked"
 ```

 1. set `Y` vector to `0`;

@@ -170,7 +172,7 @@ The code below is very usual. For more information, you can look at the document

 ```{r}
 bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
-eta = 1, nthread = 2, nrounds = 10,objective = "binary:logistic")
+eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")

 ```

@@ -219,7 +221,7 @@ For that purpose we will execute the same function as above but using two more p
 importanceRaw <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst, data = sparse_matrix, label = output_vector)

 # Cleaning for better display
-importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
+importanceClean <- importanceRaw[, `:=`(Cover = NULL, Frequency = NULL)]

 head(importanceClean)
 ```

@@ -321,8 +323,8 @@ If you want to try Random Forests algorithm, you can tweak XGBoost parameters!
 For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns:

 ```{r, warning=FALSE, message=FALSE}
-data(agaricus.train, package='xgboost')
-data(agaricus.test, package='xgboost')
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
 train <- agaricus.train
 test <- agaricus.test


@@ -52,9 +52,9 @@ It has several features:
 For weekly updated version (highly recommended), install from *GitHub*:

 ```{r installGithub, eval=FALSE}
-install.packages("drat", repos="https://cran.rstudio.com")
+install.packages("drat", repos = "https://cran.rstudio.com")
 drat:::addRepo("dmlc")
-install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source")
+install.packages("xgboost", repos = "http://dmlc.ml/drat/", type = "source")
 ```

 > *Windows* user will need to install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) first.
@@ -101,8 +101,8 @@ Why *split* the dataset in two parts?
 In the first part we will build our model. In the second part we will want to test it and assess its quality. Without dividing the dataset we would test the model on the data which the algorithm have already seen.

 ```{r datasetLoading, results='hold', message=F, warning=F}
-data(agaricus.train, package='xgboost')
-data(agaricus.test, package='xgboost')
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
 train <- agaricus.train
 test <- agaricus.test
 ```

@@ -314,8 +314,8 @@ Most of the features below have been implemented to help you to improve your mod
 For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.

 ```{r DMatrix, message=F, warning=F}
-dtrain <- xgb.DMatrix(data = train$data, label=train$label)
-dtest <- xgb.DMatrix(data = test$data, label=test$label)
+dtrain <- xgb.DMatrix(data = train$data, label = train$label)
+dtest <- xgb.DMatrix(data = test$data, label = test$label)
 ```

 ### Measure learning progress with xgb.train

@@ -332,7 +332,7 @@ One way to measure progress in learning of a model is to provide to **XGBoost**
 For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name.

 ```{r watchlist, message=F, warning=F}
-watchlist <- list(train=dtrain, test=dtest)
+watchlist <- list(train = dtrain, test = dtest)

 bst <- xgb.train(
 data = dtrain
@@ -425,7 +425,7 @@ Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereaf
 ```{r getinfo, message=F, warning=F}
 label <- getinfo(dtest, "label")
 pred <- predict(bst, dtest)
-err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
+err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
 print(paste("test-error=", err))
 ```


@@ -479,7 +479,7 @@ bst2 <- xgb.load("xgboost.model")
 pred2 <- predict(bst2, test$data)

 # And now the test
-print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
+print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))
 ```

 ```{r clean, include=FALSE}

@@ -503,7 +503,7 @@ bst3 <- xgb.load(rawVec)
 pred3 <- predict(bst3, test$data)

 # pred2 should be identical to pred
-print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
+print(paste("sum(abs(pred3-pred))=", sum(abs(pred2 - pred))))
 ```

 > Again `0`? It seems that `XGBoost` works pretty well!
@@ -30,7 +30,7 @@ For the purpose of this tutorial we will load the xgboost, jsonlite, and float p
 require(xgboost)
 require(jsonlite)
 require(float)
-options(digits=22)
+options(digits = 22)
 ```

 We will create a toy binary logistic model based on the example first provided [here](https://github.com/dmlc/xgboost/issues/3960), so that we can easily understand the structure of the dumped JSON model object. This will allow us to understand where discrepancies can occur and how they should be handled.

@@ -50,7 +50,7 @@ labels <- c(1, 1, 1,
 0, 0, 0,
 0, 0, 0)

-data <- data.frame(dates = dates, labels=labels)
+data <- data.frame(dates = dates, labels = labels)

 bst <- xgboost(
 data = as.matrix(data$dates),

@@ -69,7 +69,7 @@ We will now dump the model to JSON and attempt to illustrate a variety of issues
 First let's dump the model to JSON:

 ```{r}
-bst_json <- xgb.dump(bst, with_stats = FALSE, dump_format='json')
+bst_json <- xgb.dump(bst, with_stats = FALSE, dump_format = 'json')
 bst_from_json <- fromJSON(bst_json, simplifyDataFrame = FALSE)
 node <- bst_from_json[[1]]
 cat(bst_json)

@@ -78,10 +78,10 @@ cat(bst_json)
 The tree JSON shown by the above code-chunk tells us that if the data is less than 20180132, the tree will output the value in the first leaf. Otherwise it will output the value in the second leaf. Let's try to reproduce this manually with the data we have and confirm that it matches the model predictions we've already calculated.

 ```{r}
-bst_preds_logodds <- predict(bst,as.matrix(data$dates), outputmargin = TRUE)
+bst_preds_logodds <- predict(bst, as.matrix(data$dates), outputmargin = TRUE)

 # calculate the logodds values using the JSON representation
-bst_from_json_logodds <- ifelse(data$dates<node$split_condition,
+bst_from_json_logodds <- ifelse(data$dates < node$split_condition,
 node$children[[1]]$leaf,
 node$children[[2]]$leaf)

@@ -106,19 +106,19 @@ At this stage two things happened:
 To explain this, let's repeat the comparison and round to two decimals:

 ```{r}
-round(bst_preds_logodds,2) == round(bst_from_json_logodds,2)
+round(bst_preds_logodds, 2) == round(bst_from_json_logodds, 2)
 ```

 If we round to two decimals, we see that only the elements related to data values of `20180131` don't agree. If we convert the data to floats, they agree:

 ```{r}
 # now convert the dates to floats first
-bst_from_json_logodds <- ifelse(fl(data$dates)<node$split_condition,
+bst_from_json_logodds <- ifelse(fl(data$dates) < node$split_condition,
 node$children[[1]]$leaf,
 node$children[[2]]$leaf)

 # test that values are equal
-round(bst_preds_logodds,2) == round(bst_from_json_logodds,2)
+round(bst_preds_logodds, 2) == round(bst_from_json_logodds, 2)
 ```

 What's the lesson? If we are going to work with an imported JSON model, any data must be converted to floats first. In this case, since '20180131' cannot be represented as a 32-bit float, it is rounded up to 20180132, as shown here:
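As an illustrative aside (not part of the diff), the rounding described above can be checked directly with the {float} package that the vignette loads; this is a minimal sketch:

```r
# 20180131 has no exact 32-bit float representation, so casting it with fl()
# and converting back should give the neighbouring representable value.
library(float)
as.numeric(fl(20180131))  # expected: 20180132
```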
@@ -143,7 +143,7 @@ None are exactly equal. What happened? Although we've converted the data to 32

 ```{r}
 # now convert the dates to floats first
-bst_from_json_logodds <- ifelse(fl(data$dates)<fl(node$split_condition),
+bst_from_json_logodds <- ifelse(fl(data$dates) < fl(node$split_condition),
 as.numeric(fl(node$children[[1]]$leaf)),
 as.numeric(fl(node$children[[2]]$leaf)))


@@ -160,12 +160,13 @@ We were able to get the log-odds to agree, so now let's manually calculate the s


 ```{r}
-bst_preds <- predict(bst,as.matrix(data$dates))
+bst_preds <- predict(bst, as.matrix(data$dates))

 # calculate the predictions casting doubles to floats
-bst_from_json_preds <- ifelse(fl(data$dates)<fl(node$split_condition),
-as.numeric(1/(1+exp(-1*fl(node$children[[1]]$leaf)))),
-as.numeric(1/(1+exp(-1*fl(node$children[[2]]$leaf))))
+bst_from_json_preds <- ifelse(
+fl(data$dates) < fl(node$split_condition)
+, as.numeric(1 / (1 + exp(-1 * fl(node$children[[1]]$leaf))))
+, as.numeric(1 / (1 + exp(-1 * fl(node$children[[2]]$leaf))))
 )

 # test that values are equal

@@ -177,9 +178,10 @@ None are exactly equal again. What is going on here? Well, since we are using
 How do we fix this? We have to ensure we use the correct data types everywhere and the correct operators. If we use only floats, the float library that we have loaded will ensure the 32-bit float exponentiation operator is applied.
 ```{r}
 # calculate the predictions casting doubles to floats
-bst_from_json_preds <- ifelse(fl(data$dates)<fl(node$split_condition),
-as.numeric(fl(1)/(fl(1)+exp(fl(-1)*fl(node$children[[1]]$leaf)))),
-as.numeric(fl(1)/(fl(1)+exp(fl(-1)*fl(node$children[[2]]$leaf))))
+bst_from_json_preds <- ifelse(
+fl(data$dates) < fl(node$split_condition)
+, as.numeric(fl(1) / (fl(1) + exp(fl(-1) * fl(node$children[[1]]$leaf))))
+, as.numeric(fl(1) / (fl(1) + exp(fl(-1) * fl(node$children[[2]]$leaf))))
 )

 # test that values are equal
@@ -1,8 +1,10 @@
 site <- 'http://cran.r-project.org'
-if (!require('dummies'))
-install.packages('dummies', repos=site)
-if (!require('insuranceData'))
-install.packages('insuranceData', repos=site)
+if (!require('dummies')) {
+install.packages('dummies', repos = site)
+}
+if (!require('insuranceData')) {
+install.packages('insuranceData', repos = site)
+}

 library(dummies)
 library(insuranceData)

@@ -14,5 +16,16 @@ data$STATE <- as.factor(data$STATE)
 data$CLASS <- as.factor(data$CLASS)
 data$GENDER <- as.factor(data$GENDER)

-data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=TRUE);
-write.table(data.dummy, 'autoclaims.csv', sep=',', row.names=F, col.names=F, quote=F)
+data.dummy <- dummy.data.frame(
+data
+, dummy.class = 'factor'
+, omit.constants = TRUE
+)
+write.table(
+data.dummy
+, 'autoclaims.csv'
+, sep = ','
+, row.names = FALSE
+, col.names = FALSE
+, quote = FALSE
+)
@@ -4,21 +4,21 @@ require(methods)

 modelfile <- "higgs.model"
 outfile <- "higgs.pred.csv"
-dtest <- read.csv("data/test.csv", header=TRUE)
+dtest <- read.csv("data/test.csv", header = TRUE)
 data <- as.matrix(dtest[2:31])
 idx <- dtest[[1]]

 xgmat <- xgb.DMatrix(data, missing = -999.0)
-bst <- xgb.load(modelfile=modelfile)
+bst <- xgb.load(modelfile = modelfile)
 ypred <- predict(bst, xgmat)

-rorder <- rank(ypred, ties.method="first")
+rorder <- rank(ypred, ties.method = "first")

 threshold <- 0.15
 # to be completed
-ntop <- length(rorder) - as.integer(threshold*length(rorder))
+ntop <- length(rorder) - as.integer(threshold * length(rorder))
 plabel <- ifelse(rorder > ntop, "s", "b")
 outdata <- list("EventId" = idx,
 "RankOrder" = rorder,
 "Class" = plabel)
-write.csv(outdata, file = outfile, quote=FALSE, row.names=FALSE)
+write.csv(outdata, file = outfile, quote = FALSE, row.names = FALSE)
@@ -4,14 +4,14 @@ require(methods)

 testsize <- 550000

-dtrain <- read.csv("data/training.csv", header=TRUE)
+dtrain <- read.csv("data/training.csv", header = TRUE)
 dtrain[33] <- dtrain[33] == "s"
 label <- as.numeric(dtrain[[33]])
 data <- as.matrix(dtrain[2:31])
 weight <- as.numeric(dtrain[[32]]) * testsize / length(label)

-sumwpos <- sum(weight * (label==1.0))
-sumwneg <- sum(weight * (label==0.0))
+sumwpos <- sum(weight * (label == 1.0))
+sumwneg <- sum(weight * (label == 0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))

 xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)

@@ -25,7 +25,7 @@ param <- list("objective" = "binary:logitraw",
 watchlist <- list("train" = xgmat)
 nrounds <- 120
 print ("loading data end, start to boost trees")
-bst <- xgb.train(param, xgmat, nrounds, watchlist );
+bst <- xgb.train(param, xgmat, nrounds, watchlist)
 # save out model
 xgb.save(bst, "higgs.model")
 print ('finish training')
@@ -5,8 +5,8 @@ require(methods)

 testsize <- 550000

-dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)
-dtrain$Label <- as.numeric(dtrain$Label=='s')
+dtrain <- read.csv("data/training.csv", header = TRUE, nrows = 350001)
+dtrain$Label <- as.numeric(dtrain$Label == 's')
 # gbm.time = system.time({
 # gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
 # interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,

@@ -20,12 +20,12 @@ dtrain$Label <- as.numeric(dtrain$Label=='s')
 data <- as.matrix(dtrain[2:31])
 weight <- as.numeric(dtrain[[32]]) * testsize / length(label)

-sumwpos <- sum(weight * (label==1.0))
-sumwneg <- sum(weight * (label==0.0))
+sumwpos <- sum(weight * (label == 1.0))
+sumwneg <- sum(weight * (label == 0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))

 xgboost.time <- list()
-threads <- c(1,2,4,8,16)
+threads <- c(1, 2, 4, 8, 16)
 for (i in 1:length(threads)){
 thread <- threads[i]
 xgboost.time[[i]] <- system.time({

@@ -40,7 +40,7 @@ for (i in 1:length(threads)){
 watchlist <- list("train" = xgmat)
 nrounds <- 120
 print ("loading data end, start to boost trees")
-bst <- xgb.train(param, xgmat, nrounds, watchlist );
+bst <- xgb.train(param, xgmat, nrounds, watchlist)
 # save out model
 xgb.save(bst, "higgs.model")
 print ('finish training')

@@ -67,4 +67,3 @@ xgboost.time
 # [[5]]
 # user system elapsed
 # 157.390 5.988 40.949
-
@@ -1,20 +1,20 @@
 require(xgboost)
 require(methods)

-train <- read.csv('data/train.csv',header=TRUE,stringsAsFactors = FALSE)
-test <- read.csv('data/test.csv',header=TRUE,stringsAsFactors = FALSE)
-train <- train[,-1]
-test <- test[,-1]
+train <- read.csv('data/train.csv', header = TRUE, stringsAsFactors = FALSE)
+test <- read.csv('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
+train <- train[, -1]
+test <- test[, -1]

-y <- train[,ncol(train)]
-y <- gsub('Class_','',y)
-y <- as.integer(y)-1 # xgboost take features in [0,numOfClass)
+y <- train[, ncol(train)]
+y <- gsub('Class_', '', y)
+y <- as.integer(y) - 1 # xgboost take features in [0,numOfClass)

-x <- rbind(train[,-ncol(train)],test)
+x <- rbind(train[, -ncol(train)], test)
 x <- as.matrix(x)
-x <- matrix(as.numeric(x),nrow(x),ncol(x))
+x <- matrix(as.numeric(x), nrow(x), ncol(x))
 trind <- 1:length(y)
-teind <- (nrow(train)+1):nrow(x)
+teind <- (nrow(train) + 1):nrow(x)

 # Set necessary parameter
 param <- list("objective" = "multi:softprob",
@@ -24,20 +24,25 @@ param <- list("objective" = "multi:softprob",

 # Run Cross Validation
 cv.nrounds <- 50
-bst.cv <- xgb.cv(param=param, data = x[trind,], label = y,
-nfold = 3, nrounds=cv.nrounds)
+bst.cv <- xgb.cv(
+param = param
+, data = x[trind, ]
+, label = y
+, nfold = 3
+, nrounds = cv.nrounds
+)

 # Train the model
 nrounds <- 50
-bst <- xgboost(param=param, data = x[trind,], label = y, nrounds=nrounds)
+bst <- xgboost(param = param, data = x[trind, ], label = y, nrounds = nrounds)

 # Make prediction
-pred <- predict(bst,x[teind,])
-pred <- matrix(pred,9,length(pred)/9)
+pred <- predict(bst, x[teind, ])
+pred <- matrix(pred, 9, length(pred) / 9)
 pred <- t(pred)

 # Output submission
-pred <- format(pred, digits=2,scientific=F) # shrink the size of submission
-pred <- data.frame(1:nrow(pred),pred)
-names(pred) <- c('id', paste0('Class_',1:9))
-write.csv(pred,file='submission.csv', quote=FALSE,row.names=FALSE)
+pred <- format(pred, digits = 2, scientific = FALSE) # shrink the size of submission
+pred <- data.frame(1:nrow(pred), pred)
+names(pred) <- c('id', paste0('Class_', 1:9))
+write.csv(pred, file = 'submission.csv', quote = FALSE, row.names = FALSE)
@@ -31,7 +31,7 @@ require(methods)
 require(data.table)
 require(magrittr)
 train <- fread('data/train.csv', header = T, stringsAsFactors = FALSE)
-test <- fread('data/test.csv', header=TRUE, stringsAsFactors = FALSE)
+test <- fread('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
 ```
 > `magrittr` and `data.table` are here to make the code cleaner and much more rapid.


@@ -42,13 +42,13 @@ Let's explore the dataset.
 dim(train)

 # Training content
-train[1:6,1:5, with =FALSE]
+train[1:6, 1:5, with = FALSE]

 # Test dataset dimensions
 dim(test)

 # Test content
-test[1:6,1:5, with =FALSE]
+test[1:6, 1:5, with = FALSE]
 ```
 > We only display the 6 first rows and 5 first columns for convenience

@@ -87,7 +87,7 @@ For that purpose, we will:
 ```{r classToIntegers}
 # Convert from classes to numbers
 y <- train[, nameLastCol, with = FALSE][[1]] %>%
-gsub('Class_','',.) %>%
+gsub('Class_', '', .) %>%
 as.integer %>%
 subtract(., 1)


@@ -98,14 +98,14 @@ y[1:5]
 We remove label column from training dataset, otherwise **XGBoost** would use it to guess the labels!

 ```{r deleteCols, results='hide'}
-train[, nameLastCol:=NULL, with = FALSE]
+train[, nameLastCol := NULL, with = FALSE]
 ```

 `data.table` is an awesome implementation of data.frame, unfortunately it is not a format supported natively by **XGBoost**. We need to convert both datasets (training and test) in `numeric` Matrix format.

 ```{r convertToNumericMatrix}
-trainMatrix <- train[,lapply(.SD,as.numeric)] %>% as.matrix
-testMatrix <- test[,lapply(.SD,as.numeric)] %>% as.matrix
+trainMatrix <- train[, lapply(.SD, as.numeric)] %>% as.matrix
+testMatrix <- test[, lapply(.SD, as.numeric)] %>% as.matrix
 ```

 Model training
@@ -127,7 +127,7 @@ param <- list("objective" = "multi:softprob",
 cv.nrounds <- 5
 cv.nfold <- 3

-bst.cv <- xgb.cv(param=param, data = trainMatrix, label = y,
+bst.cv <- xgb.cv(param = param, data = trainMatrix, label = y,
 nfold = cv.nfold, nrounds = cv.nrounds)
 ```
 > As we can see the error rate is low on the test dataset (for a 5mn trained model).

@@ -136,7 +136,7 @@ Finally, we are ready to train the real model!!!

 ```{r modelTraining}
 nrounds <- 50
-bst <- xgboost(param=param, data = trainMatrix, label = y, nrounds=nrounds)
+bst <- xgboost(param = param, data = trainMatrix, label = y, nrounds = nrounds)
 ```

 Model understanding
@@ -189,7 +189,7 @@ names <- dimnames(trainMatrix)[[2]]
 importance_matrix <- xgb.importance(names, model = bst)

 # Nice graph
-xgb.plot.importance(importance_matrix[1:10,])
+xgb.plot.importance(importance_matrix[1:10, ])
 ```

 > To make it understandable we first extract the column names from the `Matrix`.
@@ -16,8 +16,8 @@ df[, `:=`(V34 = as.integer(ifelse(V34 == "?", 0L, V34)),

 idx <- sample(nrow(df), size = round(0.7 * nrow(df)), replace = FALSE)

-train <- df[idx,]
-test <- df[-idx,]
+train <- df[idx, ]
+test <- df[-idx, ]

 train_x <- train[, 1:34]
 train_y <- train[, V35]