[R] address some lintr warnings (#8609)

James Lamb 2022-12-17 04:36:14 -06:00 committed by GitHub
parent 53e6e32718
commit 17ce1f26c8
18 changed files with 137 additions and 116 deletions
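
These are the kinds of style issues {lintr} reports: missing spaces around `=` in argument defaults and around infix operators, missing spaces after commas, trailing semicolons, `F`/`T` instead of `FALSE`/`TRUE`, and single-line `if` bodies without braces. A minimal sketch of how such warnings could be reproduced locally (assuming lintr >= 3.0.0 is installed; the directory and the linter subset below are illustrative, not the project's actual lint configuration):

```r
# Sketch: surface lintr warnings like the ones fixed in this commit.
# Assumes the {lintr} package (>= 3.0.0) is installed; the directory and
# the linter subset are illustrative, not the repository's lint config.
library(lintr)

lint_dir(
  "R-package/R",
  linters = linters_with_defaults(
    infix_spaces_linter(),  # e.g. `sparse=FALSE` -> `sparse = FALSE`
    commas_linter(),        # e.g. `df[idx,]`     -> `df[idx, ]`
    semicolon_linter()      # e.g. `... handle);` -> `... handle)`
  )
)
```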

View File

@@ -611,7 +611,7 @@ cb.cv.predict <- function(save_models = FALSE) {
 #' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l')
 #'
 #' @export
-cb.gblinear.history <- function(sparse=FALSE) {
+cb.gblinear.history <- function(sparse = FALSE) {
   coefs <- NULL
   init <- function(env) {

View File

@@ -629,7 +629,7 @@ xgb.attributes <- function(object) {
 #' @export
 xgb.config <- function(object) {
   handle <- xgb.get.handle(object)
-  .Call(XGBoosterSaveJsonConfig_R, handle);
+  .Call(XGBoosterSaveJsonConfig_R, handle)
 }
 #' @rdname xgb.config

View File

@@ -119,10 +119,10 @@
 #' print(cv, verbose=TRUE)
 #'
 #' @export
-xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NA,
-                   prediction = FALSE, showsd = TRUE, metrics=list(),
+xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
+                   prediction = FALSE, showsd = TRUE, metrics = list(),
                    obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, train_folds = NULL,
-                   verbose = TRUE, print_every_n=1L,
+                   verbose = TRUE, print_every_n = 1L,
                    early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) {
   check.deprecation(...)

View File

@@ -38,7 +38,7 @@
 #' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
 #'
 #' @export
-xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE,
+xgb.dump <- function(model, fname = NULL, fmap = "", with_stats = FALSE,
                      dump_format = c("text", "json"), ...) {
   check.deprecation(...)
   dump_format <- match.arg(dump_format)

View File

@@ -34,7 +34,7 @@ df[, ID := NULL]
 # Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
 fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
 # train a xgbTree model using caret::train
-model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl)
+model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl)
 # Instead of tree for our boosters, you can also fit a linear regression or logistic regression model
 # using xgbLinear

View File

@@ -404,7 +404,7 @@ test_that("Configuration works", {
   config <- xgb.config(bst)
   xgb.config(bst) <- config
   reloaded_config <- xgb.config(bst)
-  expect_equal(config, reloaded_config);
+  expect_equal(config, reloaded_config)
 })
 test_that("strict_shape works", {

View File

@@ -53,7 +53,7 @@ test_that("xgb.DMatrix: saving, loading", {
   dtrain <- xgb.DMatrix(tmp_file)
   expect_equal(colnames(dtrain), cnames)
-  ft <- rep(c("c", "q"), each=length(cnames)/2)
+  ft <- rep(c("c", "q"), each = length(cnames) / 2)
   setinfo(dtrain, "feature_type", ft)
   expect_equal(ft, getinfo(dtrain, "feature_type"))
 })

View File

@@ -440,7 +440,7 @@ test_that("xgb.plot.shap.summary works", {
 })
 test_that("check.deprecation works", {
-  ttt <- function(a = NNULL, DUMMY=NULL, ...) {
+  ttt <- function(a = NNULL, DUMMY = NULL, ...) {
     check.deprecation(...)
     as.list((environment()))
   }

View File

@@ -28,7 +28,9 @@ Package loading:
 require(xgboost)
 require(Matrix)
 require(data.table)
-if (!require('vcd')) install.packages('vcd')
+if (!require('vcd')) {
+  install.packages('vcd')
+}
 ```
 > **VCD** package is used for one of its embedded dataset only.
@@ -100,7 +102,7 @@ Note that we transform it to `factor` so the algorithm treat these age groups as
 Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation.
 ```{r}
-head(df[,AgeDiscret := as.factor(round(Age/10,0))])
+head(df[, AgeDiscret := as.factor(round(Age / 10, 0))])
 ```
 ##### Random split into two groups
@@ -108,7 +110,7 @@ head(df[,AgeDiscret := as.factor(round(Age/10,0))])
 Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. We choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...).
 ```{r}
-head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
+head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))])
 ```
 ##### Risks in adding correlated features
@@ -124,13 +126,13 @@ Fortunately, decision tree algorithms (including boosted trees) are very robust
 We remove ID as there is nothing to learn from this feature (it would just add some noise).
 ```{r, results='hide'}
-df[,ID:=NULL]
+df[, ID := NULL]
 ```
 We will list the different values for the column `Treatment`:
 ```{r}
-levels(df[,Treatment])
+levels(df[, Treatment])
 ```
@@ -147,7 +149,7 @@ For example, the column `Treatment` will be replaced by two columns, `TreatmentP
 Column `Improved` is excluded because it will be our `label` column, the one we want to predict.
 ```{r, warning=FALSE,message=FALSE}
-sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1]
+sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[, -1]
 head(sparse_matrix)
 ```
@@ -156,7 +158,7 @@ head(sparse_matrix)
 Create the output `numeric` vector (not as a sparse `Matrix`):
 ```{r}
-output_vector <- df[,Improved] == "Marked"
+output_vector <- df[, Improved] == "Marked"
 ```
 1. set `Y` vector to `0`;
@@ -170,7 +172,7 @@ The code below is very usual. For more information, you can look at the document
 ```{r}
 bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
-               eta = 1, nthread = 2, nrounds = 10,objective = "binary:logistic")
+               eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")
 ```
@@ -219,7 +221,7 @@ For that purpose we will execute the same function as above but using two more p
 importanceRaw <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst, data = sparse_matrix, label = output_vector)
 # Cleaning for better display
-importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
+importanceClean <- importanceRaw[, `:=`(Cover = NULL, Frequency = NULL)]
 head(importanceClean)
 ```
@@ -321,8 +323,8 @@ If you want to try Random Forests algorithm, you can tweak XGBoost parameters!
 For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns:
 ```{r, warning=FALSE, message=FALSE}
-data(agaricus.train, package='xgboost')
-data(agaricus.test, package='xgboost')
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
 train <- agaricus.train
 test <- agaricus.test

View File

@@ -52,9 +52,9 @@ It has several features:
 For weekly updated version (highly recommended), install from *GitHub*:
 ```{r installGithub, eval=FALSE}
-install.packages("drat", repos="https://cran.rstudio.com")
+install.packages("drat", repos = "https://cran.rstudio.com")
 drat:::addRepo("dmlc")
-install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source")
+install.packages("xgboost", repos = "http://dmlc.ml/drat/", type = "source")
 ```
 > *Windows* user will need to install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) first.
@@ -101,8 +101,8 @@ Why *split* the dataset in two parts?
 In the first part we will build our model. In the second part we will want to test it and assess its quality. Without dividing the dataset we would test the model on the data which the algorithm have already seen.
 ```{r datasetLoading, results='hold', message=F, warning=F}
-data(agaricus.train, package='xgboost')
-data(agaricus.test, package='xgboost')
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
 train <- agaricus.train
 test <- agaricus.test
 ```
@@ -314,8 +314,8 @@ Most of the features below have been implemented to help you to improve your mod
 For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.
 ```{r DMatrix, message=F, warning=F}
-dtrain <- xgb.DMatrix(data = train$data, label=train$label)
-dtest <- xgb.DMatrix(data = test$data, label=test$label)
+dtrain <- xgb.DMatrix(data = train$data, label = train$label)
+dtest <- xgb.DMatrix(data = test$data, label = test$label)
 ```
 ### Measure learning progress with xgb.train
@@ -332,7 +332,7 @@ One way to measure progress in learning of a model is to provide to **XGBoost**
 For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name.
 ```{r watchlist, message=F, warning=F}
-watchlist <- list(train=dtrain, test=dtest)
+watchlist <- list(train = dtrain, test = dtest)
 bst <- xgb.train(
   data = dtrain
@@ -425,7 +425,7 @@ Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereaf
 ```{r getinfo, message=F, warning=F}
 label <- getinfo(dtest, "label")
 pred <- predict(bst, dtest)
-err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
+err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
 print(paste("test-error=", err))
 ```
@@ -479,7 +479,7 @@ bst2 <- xgb.load("xgboost.model")
 pred2 <- predict(bst2, test$data)
 # And now the test
-print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
+print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))
 ```
 ```{r clean, include=FALSE}
@@ -503,7 +503,7 @@ bst3 <- xgb.load(rawVec)
 pred3 <- predict(bst3, test$data)
 # pred2 should be identical to pred
-print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
+print(paste("sum(abs(pred3-pred))=", sum(abs(pred2 - pred))))
 ```
 > Again `0`? It seems that `XGBoost` works pretty well!

View File

@@ -30,7 +30,7 @@ For the purpose of this tutorial we will load the xgboost, jsonlite, and float p
 require(xgboost)
 require(jsonlite)
 require(float)
-options(digits=22)
+options(digits = 22)
 ```
 We will create a toy binary logistic model based on the example first provided [here](https://github.com/dmlc/xgboost/issues/3960), so that we can easily understand the structure of the dumped JSON model object. This will allow us to understand where discrepancies can occur and how they should be handled.
@@ -50,7 +50,7 @@ labels <- c(1, 1, 1,
             0, 0, 0,
             0, 0, 0)
-data <- data.frame(dates = dates, labels=labels)
+data <- data.frame(dates = dates, labels = labels)
 bst <- xgboost(
   data = as.matrix(data$dates),
@@ -69,7 +69,7 @@ We will now dump the model to JSON and attempt to illustrate a variety of issues
 First let's dump the model to JSON:
 ```{r}
-bst_json <- xgb.dump(bst, with_stats = FALSE, dump_format='json')
+bst_json <- xgb.dump(bst, with_stats = FALSE, dump_format = 'json')
 bst_from_json <- fromJSON(bst_json, simplifyDataFrame = FALSE)
 node <- bst_from_json[[1]]
 cat(bst_json)
@@ -78,10 +78,10 @@ cat(bst_json)
 The tree JSON shown by the above code-chunk tells us that if the data is less than 20180132, the tree will output the value in the first leaf. Otherwise it will output the value in the second leaf. Let's try to reproduce this manually with the data we have and confirm that it matches the model predictions we've already calculated.
 ```{r}
-bst_preds_logodds <- predict(bst,as.matrix(data$dates), outputmargin = TRUE)
+bst_preds_logodds <- predict(bst, as.matrix(data$dates), outputmargin = TRUE)
 # calculate the logodds values using the JSON representation
-bst_from_json_logodds <- ifelse(data$dates<node$split_condition,
+bst_from_json_logodds <- ifelse(data$dates < node$split_condition,
                                 node$children[[1]]$leaf,
                                 node$children[[2]]$leaf)
@@ -106,19 +106,19 @@ At this stage two things happened:
 To explain this, let's repeat the comparison and round to two decimals:
 ```{r}
-round(bst_preds_logodds,2) == round(bst_from_json_logodds,2)
+round(bst_preds_logodds, 2) == round(bst_from_json_logodds, 2)
 ```
 If we round to two decimals, we see that only the elements related to data values of `20180131` don't agree. If we convert the data to floats, they agree:
 ```{r}
 # now convert the dates to floats first
-bst_from_json_logodds <- ifelse(fl(data$dates)<node$split_condition,
+bst_from_json_logodds <- ifelse(fl(data$dates) < node$split_condition,
                                 node$children[[1]]$leaf,
                                 node$children[[2]]$leaf)
 # test that values are equal
-round(bst_preds_logodds,2) == round(bst_from_json_logodds,2)
+round(bst_preds_logodds, 2) == round(bst_from_json_logodds, 2)
 ```
 What's the lesson? If we are going to work with an imported JSON model, any data must be converted to floats first. In this case, since '20180131' cannot be represented as a 32-bit float, it is rounded up to 20180132, as shown here:
@@ -143,7 +143,7 @@ None are exactly equal. What happened? Although we've converted the data to 32
 ```{r}
 # now convert the dates to floats first
-bst_from_json_logodds <- ifelse(fl(data$dates)<fl(node$split_condition),
+bst_from_json_logodds <- ifelse(fl(data$dates) < fl(node$split_condition),
                                 as.numeric(fl(node$children[[1]]$leaf)),
                                 as.numeric(fl(node$children[[2]]$leaf)))
@@ -160,12 +160,13 @@ We were able to get the log-odds to agree, so now let's manually calculate the s
 ```{r}
-bst_preds <- predict(bst,as.matrix(data$dates))
+bst_preds <- predict(bst, as.matrix(data$dates))
 # calculate the predictions casting doubles to floats
-bst_from_json_preds <- ifelse(fl(data$dates)<fl(node$split_condition),
-                              as.numeric(1/(1+exp(-1*fl(node$children[[1]]$leaf)))),
-                              as.numeric(1/(1+exp(-1*fl(node$children[[2]]$leaf))))
+bst_from_json_preds <- ifelse(
+  fl(data$dates) < fl(node$split_condition)
+  , as.numeric(1 / (1 + exp(-1 * fl(node$children[[1]]$leaf))))
+  , as.numeric(1 / (1 + exp(-1 * fl(node$children[[2]]$leaf))))
 )
 # test that values are equal
@@ -177,9 +178,10 @@ None are exactly equal again. What is going on here? Well, since we are using
 How do we fix this? We have to ensure we use the correct data types everywhere and the correct operators. If we use only floats, the float library that we have loaded will ensure the 32-bit float exponentiation operator is applied.
 ```{r}
 # calculate the predictions casting doubles to floats
-bst_from_json_preds <- ifelse(fl(data$dates)<fl(node$split_condition),
-                              as.numeric(fl(1)/(fl(1)+exp(fl(-1)*fl(node$children[[1]]$leaf)))),
-                              as.numeric(fl(1)/(fl(1)+exp(fl(-1)*fl(node$children[[2]]$leaf))))
+bst_from_json_preds <- ifelse(
+  fl(data$dates) < fl(node$split_condition)
+  , as.numeric(fl(1) / (fl(1) + exp(fl(-1) * fl(node$children[[1]]$leaf))))
+  , as.numeric(fl(1) / (fl(1) + exp(fl(-1) * fl(node$children[[2]]$leaf))))
 )
 # test that values are equal

View File

@@ -1,8 +1,10 @@
 site <- 'http://cran.r-project.org'
-if (!require('dummies'))
-  install.packages('dummies', repos=site)
-if (!require('insuranceData'))
-  install.packages('insuranceData', repos=site)
+if (!require('dummies')) {
+  install.packages('dummies', repos = site)
+}
+if (!require('insuranceData')) {
+  install.packages('insuranceData', repos = site)
+}
 library(dummies)
 library(insuranceData)
@@ -14,5 +16,16 @@ data$STATE <- as.factor(data$STATE)
 data$CLASS <- as.factor(data$CLASS)
 data$GENDER <- as.factor(data$GENDER)
-data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=TRUE);
-write.table(data.dummy, 'autoclaims.csv', sep=',', row.names=F, col.names=F, quote=F)
+data.dummy <- dummy.data.frame(
+  data
+  , dummy.class = 'factor'
+  , omit.constants = TRUE
+)
+write.table(
+  data.dummy
+  , 'autoclaims.csv'
+  , sep = ','
+  , row.names = FALSE
+  , col.names = FALSE
+  , quote = FALSE
+)

View File

@@ -4,21 +4,21 @@ require(methods)
 modelfile <- "higgs.model"
 outfile <- "higgs.pred.csv"
-dtest <- read.csv("data/test.csv", header=TRUE)
+dtest <- read.csv("data/test.csv", header = TRUE)
 data <- as.matrix(dtest[2:31])
 idx <- dtest[[1]]
 xgmat <- xgb.DMatrix(data, missing = -999.0)
-bst <- xgb.load(modelfile=modelfile)
+bst <- xgb.load(modelfile = modelfile)
 ypred <- predict(bst, xgmat)
-rorder <- rank(ypred, ties.method="first")
+rorder <- rank(ypred, ties.method = "first")
 threshold <- 0.15
 # to be completed
-ntop <- length(rorder) - as.integer(threshold*length(rorder))
+ntop <- length(rorder) - as.integer(threshold * length(rorder))
 plabel <- ifelse(rorder > ntop, "s", "b")
 outdata <- list("EventId" = idx,
                 "RankOrder" = rorder,
                 "Class" = plabel)
-write.csv(outdata, file = outfile, quote=FALSE, row.names=FALSE)
+write.csv(outdata, file = outfile, quote = FALSE, row.names = FALSE)

View File

@@ -4,14 +4,14 @@ require(methods)
 testsize <- 550000
-dtrain <- read.csv("data/training.csv", header=TRUE)
+dtrain <- read.csv("data/training.csv", header = TRUE)
 dtrain[33] <- dtrain[33] == "s"
 label <- as.numeric(dtrain[[33]])
 data <- as.matrix(dtrain[2:31])
 weight <- as.numeric(dtrain[[32]]) * testsize / length(label)
-sumwpos <- sum(weight * (label==1.0))
-sumwneg <- sum(weight * (label==0.0))
+sumwpos <- sum(weight * (label == 1.0))
+sumwneg <- sum(weight * (label == 0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
 xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
@@ -25,7 +25,7 @@ param <- list("objective" = "binary:logitraw",
 watchlist <- list("train" = xgmat)
 nrounds <- 120
 print ("loading data end, start to boost trees")
-bst <- xgb.train(param, xgmat, nrounds, watchlist );
+bst <- xgb.train(param, xgmat, nrounds, watchlist)
 # save out model
 xgb.save(bst, "higgs.model")
 print ('finish training')

View File

@@ -5,8 +5,8 @@ require(methods)
 testsize <- 550000
-dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)
-dtrain$Label <- as.numeric(dtrain$Label=='s')
+dtrain <- read.csv("data/training.csv", header = TRUE, nrows = 350001)
+dtrain$Label <- as.numeric(dtrain$Label == 's')
 # gbm.time = system.time({
 # gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
 # interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,
@@ -20,12 +20,12 @@ dtrain$Label <- as.numeric(dtrain$Label=='s')
 data <- as.matrix(dtrain[2:31])
 weight <- as.numeric(dtrain[[32]]) * testsize / length(label)
-sumwpos <- sum(weight * (label==1.0))
-sumwneg <- sum(weight * (label==0.0))
+sumwpos <- sum(weight * (label == 1.0))
+sumwneg <- sum(weight * (label == 0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
 xgboost.time <- list()
-threads <- c(1,2,4,8,16)
+threads <- c(1, 2, 4, 8, 16)
 for (i in 1:length(threads)){
   thread <- threads[i]
   xgboost.time[[i]] <- system.time({
@@ -40,7 +40,7 @@ for (i in 1:length(threads)){
     watchlist <- list("train" = xgmat)
     nrounds <- 120
     print ("loading data end, start to boost trees")
-    bst <- xgb.train(param, xgmat, nrounds, watchlist );
+    bst <- xgb.train(param, xgmat, nrounds, watchlist)
     # save out model
     xgb.save(bst, "higgs.model")
     print ('finish training')
@@ -67,4 +67,3 @@ xgboost.time
 # [[5]]
 # user system elapsed
 # 157.390 5.988 40.949
-

View File

@@ -1,20 +1,20 @@
 require(xgboost)
 require(methods)
-train <- read.csv('data/train.csv',header=TRUE,stringsAsFactors = FALSE)
-test <- read.csv('data/test.csv',header=TRUE,stringsAsFactors = FALSE)
-train <- train[,-1]
-test <- test[,-1]
-y <- train[,ncol(train)]
-y <- gsub('Class_','',y)
-y <- as.integer(y)-1 # xgboost take features in [0,numOfClass)
-x <- rbind(train[,-ncol(train)],test)
+train <- read.csv('data/train.csv', header = TRUE, stringsAsFactors = FALSE)
+test <- read.csv('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
+train <- train[, -1]
+test <- test[, -1]
+y <- train[, ncol(train)]
+y <- gsub('Class_', '', y)
+y <- as.integer(y) - 1 # xgboost take features in [0,numOfClass)
+x <- rbind(train[, -ncol(train)], test)
 x <- as.matrix(x)
-x <- matrix(as.numeric(x),nrow(x),ncol(x))
+x <- matrix(as.numeric(x), nrow(x), ncol(x))
 trind <- 1:length(y)
-teind <- (nrow(train)+1):nrow(x)
+teind <- (nrow(train) + 1):nrow(x)
 # Set necessary parameter
 param <- list("objective" = "multi:softprob",
@@ -24,20 +24,25 @@ param <- list("objective" = "multi:softprob",
 # Run Cross Validation
 cv.nrounds <- 50
-bst.cv <- xgb.cv(param=param, data = x[trind,], label = y,
-                 nfold = 3, nrounds=cv.nrounds)
+bst.cv <- xgb.cv(
+  param = param
+  , data = x[trind, ]
+  , label = y
+  , nfold = 3
+  , nrounds = cv.nrounds
+)
 # Train the model
 nrounds <- 50
-bst <- xgboost(param=param, data = x[trind,], label = y, nrounds=nrounds)
+bst <- xgboost(param = param, data = x[trind, ], label = y, nrounds = nrounds)
 # Make prediction
-pred <- predict(bst,x[teind,])
-pred <- matrix(pred,9,length(pred)/9)
+pred <- predict(bst, x[teind, ])
+pred <- matrix(pred, 9, length(pred) / 9)
 pred <- t(pred)
 # Output submission
-pred <- format(pred, digits=2,scientific=F) # shrink the size of submission
-pred <- data.frame(1:nrow(pred),pred)
-names(pred) <- c('id', paste0('Class_',1:9))
-write.csv(pred,file='submission.csv', quote=FALSE,row.names=FALSE)
+pred <- format(pred, digits = 2, scientific = FALSE) # shrink the size of submission
+pred <- data.frame(1:nrow(pred), pred)
+names(pred) <- c('id', paste0('Class_', 1:9))
+write.csv(pred, file = 'submission.csv', quote = FALSE, row.names = FALSE)

View File

@@ -31,7 +31,7 @@ require(methods)
 require(data.table)
 require(magrittr)
 train <- fread('data/train.csv', header = T, stringsAsFactors = FALSE)
-test <- fread('data/test.csv', header=TRUE, stringsAsFactors = FALSE)
+test <- fread('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
 ```
 > `magrittr` and `data.table` are here to make the code cleaner and much more rapid.
@@ -42,13 +42,13 @@ Let's explore the dataset.
 dim(train)
 # Training content
-train[1:6,1:5, with =FALSE]
+train[1:6, 1:5, with = FALSE]
 # Test dataset dimensions
 dim(test)
 # Test content
-test[1:6,1:5, with =FALSE]
+test[1:6, 1:5, with = FALSE]
 ```
 > We only display the 6 first rows and 5 first columns for convenience
@@ -87,7 +87,7 @@ For that purpose, we will:
 ```{r classToIntegers}
 # Convert from classes to numbers
 y <- train[, nameLastCol, with = FALSE][[1]] %>%
-  gsub('Class_','',.) %>%
+  gsub('Class_', '', .) %>%
   as.integer %>%
   subtract(., 1)
@@ -98,14 +98,14 @@ y[1:5]
 We remove label column from training dataset, otherwise **XGBoost** would use it to guess the labels!
 ```{r deleteCols, results='hide'}
-train[, nameLastCol:=NULL, with = FALSE]
+train[, nameLastCol := NULL, with = FALSE]
 ```
 `data.table` is an awesome implementation of data.frame, unfortunately it is not a format supported natively by **XGBoost**. We need to convert both datasets (training and test) in `numeric` Matrix format.
 ```{r convertToNumericMatrix}
-trainMatrix <- train[,lapply(.SD,as.numeric)] %>% as.matrix
-testMatrix <- test[,lapply(.SD,as.numeric)] %>% as.matrix
+trainMatrix <- train[, lapply(.SD, as.numeric)] %>% as.matrix
+testMatrix <- test[, lapply(.SD, as.numeric)] %>% as.matrix
 ```
 Model training
@@ -127,7 +127,7 @@ param <- list("objective" = "multi:softprob",
 cv.nrounds <- 5
 cv.nfold <- 3
-bst.cv <- xgb.cv(param=param, data = trainMatrix, label = y,
+bst.cv <- xgb.cv(param = param, data = trainMatrix, label = y,
                  nfold = cv.nfold, nrounds = cv.nrounds)
 ```
 > As we can see the error rate is low on the test dataset (for a 5mn trained model).
@@ -136,7 +136,7 @@ Finally, we are ready to train the real model!!!
 ```{r modelTraining}
 nrounds <- 50
-bst <- xgboost(param=param, data = trainMatrix, label = y, nrounds=nrounds)
+bst <- xgboost(param = param, data = trainMatrix, label = y, nrounds = nrounds)
 ```
 Model understanding
@@ -189,7 +189,7 @@ names <- dimnames(trainMatrix)[[2]]
 importance_matrix <- xgb.importance(names, model = bst)
 # Nice graph
-xgb.plot.importance(importance_matrix[1:10,])
+xgb.plot.importance(importance_matrix[1:10, ])
``` ```
> To make it understandable we first extract the column names from the `Matrix`. > To make it understandable we first extract the column names from the `Matrix`.

View File

@@ -16,8 +16,8 @@ df[, `:=`(V34 = as.integer(ifelse(V34 == "?", 0L, V34)),
 idx <- sample(nrow(df), size = round(0.7 * nrow(df)), replace = FALSE)
-train <- df[idx,]
-test <- df[-idx,]
+train <- df[idx, ]
+test <- df[-idx, ]
 train_x <- train[, 1:34]
 train_y <- train[, V35]