From 17ce1f26c86596a3987f92199b23ddd517ea3815 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 17 Dec 2022 04:36:14 -0600 Subject: [PATCH] [R] address some lintr warnings (#8609) --- R-package/R/callbacks.R | 2 +- R-package/R/xgb.Booster.R | 2 +- R-package/R/xgb.cv.R | 6 +-- R-package/R/xgb.dump.R | 2 +- R-package/demo/caret_wrapper.R | 2 +- R-package/tests/testthat/test_basic.R | 2 +- R-package/tests/testthat/test_dmatrix.R | 2 +- R-package/tests/testthat/test_helpers.R | 2 +- R-package/vignettes/discoverYourData.Rmd | 24 ++++++----- R-package/vignettes/xgboostPresentation.Rmd | 20 ++++----- R-package/vignettes/xgboostfromJSON.Rmd | 36 ++++++++-------- demo/data/gen_autoclaims.R | 25 ++++++++--- demo/kaggle-higgs/higgs-pred.R | 10 ++--- demo/kaggle-higgs/higgs-train.R | 8 ++-- demo/kaggle-higgs/speedtest.R | 43 +++++++++---------- demo/kaggle-otto/otto_train_pred.R | 43 +++++++++++-------- .../kaggle-otto/understandingXGBoostModel.Rmd | 20 ++++----- demo/multiclass_classification/train.R | 4 +- 18 files changed, 137 insertions(+), 116 deletions(-) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 957fca6d4..3c4d2e13a 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -611,7 +611,7 @@ cb.cv.predict <- function(save_models = FALSE) { #' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l') #' #' @export -cb.gblinear.history <- function(sparse=FALSE) { +cb.gblinear.history <- function(sparse = FALSE) { coefs <- NULL init <- function(env) { diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 2f1f5091c..46852cba8 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -629,7 +629,7 @@ xgb.attributes <- function(object) { #' @export xgb.config <- function(object) { handle <- xgb.get.handle(object) - .Call(XGBoosterSaveJsonConfig_R, handle); + .Call(XGBoosterSaveJsonConfig_R, handle) } #' @rdname xgb.config diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index f428f4285..27730cbc3 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -119,10 +119,10 @@ #' print(cv, verbose=TRUE) #' #' @export -xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NA, - prediction = FALSE, showsd = TRUE, metrics=list(), +xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing = NA, + prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, train_folds = NULL, - verbose = TRUE, print_every_n=1L, + verbose = TRUE, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) { check.deprecation(...) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index b5b8b7eb4..a2de26c26 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -38,7 +38,7 @@ #' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json')) #' #' @export -xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE, +xgb.dump <- function(model, fname = NULL, fmap = "", with_stats = FALSE, dump_format = c("text", "json"), ...) { check.deprecation(...) dump_format <- match.arg(dump_format) diff --git a/R-package/demo/caret_wrapper.R b/R-package/demo/caret_wrapper.R index cace1e469..0e63f27ce 100644 --- a/R-package/demo/caret_wrapper.R +++ b/R-package/demo/caret_wrapper.R @@ -34,7 +34,7 @@ df[, ID := NULL] # Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters. 
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random") # train a xgbTree model using caret::train -model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl) +model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl) # Instead of tree for our boosters, you can also fit a linear regression or logistic regression model # using xgbLinear diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index ad8c8a830..4420c9105 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -404,7 +404,7 @@ test_that("Configuration works", { config <- xgb.config(bst) xgb.config(bst) <- config reloaded_config <- xgb.config(bst) - expect_equal(config, reloaded_config); + expect_equal(config, reloaded_config) }) test_that("strict_shape works", { diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index eb83544d8..74d172f69 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -53,7 +53,7 @@ test_that("xgb.DMatrix: saving, loading", { dtrain <- xgb.DMatrix(tmp_file) expect_equal(colnames(dtrain), cnames) - ft <- rep(c("c", "q"), each=length(cnames)/2) + ft <- rep(c("c", "q"), each = length(cnames) / 2) setinfo(dtrain, "feature_type", ft) expect_equal(ft, getinfo(dtrain, "feature_type")) }) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index dc61fd1f8..2eab5624f 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -440,7 +440,7 @@ test_that("xgb.plot.shap.summary works", { }) test_that("check.deprecation works", { - ttt <- function(a = NNULL, DUMMY=NULL, ...) { + ttt <- function(a = NNULL, DUMMY = NULL, ...) { check.deprecation(...) as.list((environment())) } diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd index 4a4e1811a..c939232a1 100644 --- a/R-package/vignettes/discoverYourData.Rmd +++ b/R-package/vignettes/discoverYourData.Rmd @@ -28,7 +28,9 @@ Package loading: require(xgboost) require(Matrix) require(data.table) -if (!require('vcd')) install.packages('vcd') +if (!require('vcd')) { + install.packages('vcd') +} ``` > **VCD** package is used for one of its embedded dataset only. @@ -100,7 +102,7 @@ Note that we transform it to `factor` so the algorithm treat these age groups as Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation. ```{r} -head(df[,AgeDiscret := as.factor(round(Age/10,0))]) +head(df[, AgeDiscret := as.factor(round(Age / 10, 0))]) ``` ##### Random split into two groups @@ -108,7 +110,7 @@ head(df[,AgeDiscret := as.factor(round(Age/10,0))]) Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. We choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...). ```{r} -head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]) +head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]) ``` ##### Risks in adding correlated features @@ -124,13 +126,13 @@ Fortunately, decision tree algorithms (including boosted trees) are very robust We remove ID as there is nothing to learn from this feature (it would just add some noise). 
```{r, results='hide'} -df[,ID:=NULL] +df[, ID := NULL] ``` We will list the different values for the column `Treatment`: ```{r} -levels(df[,Treatment]) +levels(df[, Treatment]) ``` @@ -147,7 +149,7 @@ For example, the column `Treatment` will be replaced by two columns, `TreatmentP Column `Improved` is excluded because it will be our `label` column, the one we want to predict. ```{r, warning=FALSE,message=FALSE} -sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1] +sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[, -1] head(sparse_matrix) ``` @@ -156,7 +158,7 @@ head(sparse_matrix) Create the output `numeric` vector (not as a sparse `Matrix`): ```{r} -output_vector <- df[,Improved] == "Marked" +output_vector <- df[, Improved] == "Marked" ``` 1. set `Y` vector to `0`; @@ -170,7 +172,7 @@ The code below is very usual. For more information, you can look at the document ```{r} bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4, - eta = 1, nthread = 2, nrounds = 10,objective = "binary:logistic") + eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic") ``` @@ -219,7 +221,7 @@ For that purpose we will execute the same function as above but using two more p importanceRaw <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst, data = sparse_matrix, label = output_vector) # Cleaning for better display -importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)] +importanceClean <- importanceRaw[, `:=`(Cover = NULL, Frequency = NULL)] head(importanceClean) ``` @@ -321,8 +323,8 @@ If you want to try Random Forests algorithm, you can tweak XGBoost parameters! For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns: ```{r, warning=FALSE, message=FALSE} -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') train <- agaricus.train test <- agaricus.test diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index 64038e02d..34f5963d5 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -52,9 +52,9 @@ It has several features: For weekly updated version (highly recommended), install from *GitHub*: ```{r installGithub, eval=FALSE} -install.packages("drat", repos="https://cran.rstudio.com") +install.packages("drat", repos = "https://cran.rstudio.com") drat:::addRepo("dmlc") -install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source") +install.packages("xgboost", repos = "http://dmlc.ml/drat/", type = "source") ``` > *Windows* user will need to install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) first. @@ -101,8 +101,8 @@ Why *split* the dataset in two parts? In the first part we will build our model. In the second part we will want to test it and assess its quality. Without dividing the dataset we would test the model on the data which the algorithm have already seen. ```{r datasetLoading, results='hold', message=F, warning=F} -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') train <- agaricus.train test <- agaricus.test ``` @@ -314,8 +314,8 @@ Most of the features below have been implemented to help you to improve your mod For the following advanced features, we need to put data in `xgb.DMatrix` as explained above. 
```{r DMatrix, message=F, warning=F} -dtrain <- xgb.DMatrix(data = train$data, label=train$label) -dtest <- xgb.DMatrix(data = test$data, label=test$label) +dtrain <- xgb.DMatrix(data = train$data, label = train$label) +dtest <- xgb.DMatrix(data = test$data, label = test$label) ``` ### Measure learning progress with xgb.train @@ -332,7 +332,7 @@ One way to measure progress in learning of a model is to provide to **XGBoost** For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name. ```{r watchlist, message=F, warning=F} -watchlist <- list(train=dtrain, test=dtest) +watchlist <- list(train = dtrain, test = dtest) bst <- xgb.train( data = dtrain @@ -425,7 +425,7 @@ Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereaf ```{r getinfo, message=F, warning=F} label <- getinfo(dtest, "label") pred <- predict(bst, dtest) -err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label) +err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label) print(paste("test-error=", err)) ``` @@ -479,7 +479,7 @@ bst2 <- xgb.load("xgboost.model") pred2 <- predict(bst2, test$data) # And now the test -print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred)))) +print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred)))) ``` ```{r clean, include=FALSE} @@ -503,7 +503,7 @@ bst3 <- xgb.load(rawVec) pred3 <- predict(bst3, test$data) # pred2 should be identical to pred -print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred)))) +print(paste("sum(abs(pred3-pred))=", sum(abs(pred2 - pred)))) ``` > Again `0`? It seems that `XGBoost` works pretty well! diff --git a/R-package/vignettes/xgboostfromJSON.Rmd b/R-package/vignettes/xgboostfromJSON.Rmd index 544186830..0ea2f56cb 100644 --- a/R-package/vignettes/xgboostfromJSON.Rmd +++ b/R-package/vignettes/xgboostfromJSON.Rmd @@ -30,7 +30,7 @@ For the purpose of this tutorial we will load the xgboost, jsonlite, and float p require(xgboost) require(jsonlite) require(float) -options(digits=22) +options(digits = 22) ``` We will create a toy binary logistic model based on the example first provided [here](https://github.com/dmlc/xgboost/issues/3960), so that we can easily understand the structure of the dumped JSON model object. This will allow us to understand where discrepancies can occur and how they should be handled. @@ -50,10 +50,10 @@ labels <- c(1, 1, 1, 0, 0, 0, 0, 0, 0) -data <- data.frame(dates = dates, labels=labels) +data <- data.frame(dates = dates, labels = labels) bst <- xgboost( - data = as.matrix(data$dates), + data = as.matrix(data$dates), label = labels, nthread = 2, nrounds = 1, @@ -69,7 +69,7 @@ We will now dump the model to JSON and attempt to illustrate a variety of issues First let's dump the model to JSON: ```{r} -bst_json <- xgb.dump(bst, with_stats = FALSE, dump_format='json') +bst_json <- xgb.dump(bst, with_stats = FALSE, dump_format = 'json') bst_from_json <- fromJSON(bst_json, simplifyDataFrame = FALSE) node <- bst_from_json[[1]] cat(bst_json) @@ -78,10 +78,10 @@ cat(bst_json) The tree JSON shown by the above code-chunk tells us that if the data is less than 20180132, the tree will output the value in the first leaf. Otherwise it will output the value in the second leaf. Let's try to reproduce this manually with the data we have and confirm that it matches the model predictions we've already calculated. 
```{r} -bst_preds_logodds <- predict(bst,as.matrix(data$dates), outputmargin = TRUE) +bst_preds_logodds <- predict(bst, as.matrix(data$dates), outputmargin = TRUE) # calculate the logodds values using the JSON representation -bst_from_json_logodds <- ifelse(data$dates ntop, "s", "b") outdata <- list("EventId" = idx, "RankOrder" = rorder, "Class" = plabel) -write.csv(outdata, file = outfile, quote=FALSE, row.names=FALSE) +write.csv(outdata, file = outfile, quote = FALSE, row.names = FALSE) diff --git a/demo/kaggle-higgs/higgs-train.R b/demo/kaggle-higgs/higgs-train.R index 54e36a73f..6f37040f8 100644 --- a/demo/kaggle-higgs/higgs-train.R +++ b/demo/kaggle-higgs/higgs-train.R @@ -4,14 +4,14 @@ require(methods) testsize <- 550000 -dtrain <- read.csv("data/training.csv", header=TRUE) +dtrain <- read.csv("data/training.csv", header = TRUE) dtrain[33] <- dtrain[33] == "s" label <- as.numeric(dtrain[[33]]) data <- as.matrix(dtrain[2:31]) weight <- as.numeric(dtrain[[32]]) * testsize / length(label) -sumwpos <- sum(weight * (label==1.0)) -sumwneg <- sum(weight * (label==0.0)) +sumwpos <- sum(weight * (label == 1.0)) +sumwneg <- sum(weight * (label == 0.0)) print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos)) xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0) @@ -25,7 +25,7 @@ param <- list("objective" = "binary:logitraw", watchlist <- list("train" = xgmat) nrounds <- 120 print ("loading data end, start to boost trees") -bst <- xgb.train(param, xgmat, nrounds, watchlist ); +bst <- xgb.train(param, xgmat, nrounds, watchlist) # save out model xgb.save(bst, "higgs.model") print ('finish training') diff --git a/demo/kaggle-higgs/speedtest.R b/demo/kaggle-higgs/speedtest.R index 12924aa54..2794fdd75 100644 --- a/demo/kaggle-higgs/speedtest.R +++ b/demo/kaggle-higgs/speedtest.R @@ -5,10 +5,10 @@ require(methods) testsize <- 550000 -dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001) -dtrain$Label <- as.numeric(dtrain$Label=='s') +dtrain <- read.csv("data/training.csv", header = TRUE, nrows = 350001) +dtrain$Label <- as.numeric(dtrain$Label == 's') # gbm.time = system.time({ -# gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120, +# gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120, # interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1, # verbose = TRUE) # }) @@ -20,12 +20,12 @@ dtrain$Label <- as.numeric(dtrain$Label=='s') data <- as.matrix(dtrain[2:31]) weight <- as.numeric(dtrain[[32]]) * testsize / length(label) -sumwpos <- sum(weight * (label==1.0)) -sumwneg <- sum(weight * (label==0.0)) +sumwpos <- sum(weight * (label == 1.0)) +sumwneg <- sum(weight * (label == 0.0)) print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos)) xgboost.time <- list() -threads <- c(1,2,4,8,16) +threads <- c(1, 2, 4, 8, 16) for (i in 1:length(threads)){ thread <- threads[i] xgboost.time[[i]] <- system.time({ @@ -40,7 +40,7 @@ for (i in 1:length(threads)){ watchlist <- list("train" = xgmat) nrounds <- 120 print ("loading data end, start to boost trees") - bst <- xgb.train(param, xgmat, nrounds, watchlist ); + bst <- xgb.train(param, xgmat, nrounds, watchlist) # save out model xgb.save(bst, "higgs.model") print ('finish training') @@ -49,22 +49,21 @@ for (i in 1:length(threads)){ xgboost.time # [[1]] -# user system elapsed -# 99.015 0.051 98.982 -# +# user system elapsed +# 99.015 0.051 98.982 +# # [[2]] -# user system elapsed -# 100.268 0.317 55.473 -# +# user 
system elapsed +# 100.268 0.317 55.473 +# # [[3]] -# user system elapsed -# 111.682 0.777 35.963 -# +# user system elapsed +# 111.682 0.777 35.963 +# # [[4]] -# user system elapsed -# 149.396 1.851 32.661 -# +# user system elapsed +# 149.396 1.851 32.661 +# # [[5]] -# user system elapsed -# 157.390 5.988 40.949 - +# user system elapsed +# 157.390 5.988 40.949 diff --git a/demo/kaggle-otto/otto_train_pred.R b/demo/kaggle-otto/otto_train_pred.R index c60a123eb..7f54aebcd 100644 --- a/demo/kaggle-otto/otto_train_pred.R +++ b/demo/kaggle-otto/otto_train_pred.R @@ -1,20 +1,20 @@ require(xgboost) require(methods) -train <- read.csv('data/train.csv',header=TRUE,stringsAsFactors = FALSE) -test <- read.csv('data/test.csv',header=TRUE,stringsAsFactors = FALSE) -train <- train[,-1] -test <- test[,-1] +train <- read.csv('data/train.csv', header = TRUE, stringsAsFactors = FALSE) +test <- read.csv('data/test.csv', header = TRUE, stringsAsFactors = FALSE) +train <- train[, -1] +test <- test[, -1] -y <- train[,ncol(train)] -y <- gsub('Class_','',y) -y <- as.integer(y)-1 # xgboost take features in [0,numOfClass) +y <- train[, ncol(train)] +y <- gsub('Class_', '', y) +y <- as.integer(y) - 1 # xgboost take features in [0,numOfClass) -x <- rbind(train[,-ncol(train)],test) +x <- rbind(train[, -ncol(train)], test) x <- as.matrix(x) -x <- matrix(as.numeric(x),nrow(x),ncol(x)) +x <- matrix(as.numeric(x), nrow(x), ncol(x)) trind <- 1:length(y) -teind <- (nrow(train)+1):nrow(x) +teind <- (nrow(train) + 1):nrow(x) # Set necessary parameter param <- list("objective" = "multi:softprob", @@ -24,20 +24,25 @@ param <- list("objective" = "multi:softprob", # Run Cross Validation cv.nrounds <- 50 -bst.cv <- xgb.cv(param=param, data = x[trind,], label = y, - nfold = 3, nrounds=cv.nrounds) +bst.cv <- xgb.cv( + param = param + , data = x[trind, ] + , label = y + , nfold = 3 + , nrounds = cv.nrounds +) # Train the model nrounds <- 50 -bst <- xgboost(param=param, data = x[trind,], label = y, nrounds=nrounds) +bst <- xgboost(param = param, data = x[trind, ], label = y, nrounds = nrounds) # Make prediction -pred <- predict(bst,x[teind,]) -pred <- matrix(pred,9,length(pred)/9) +pred <- predict(bst, x[teind, ]) +pred <- matrix(pred, 9, length(pred) / 9) pred <- t(pred) # Output submission -pred <- format(pred, digits=2,scientific=F) # shrink the size of submission -pred <- data.frame(1:nrow(pred),pred) -names(pred) <- c('id', paste0('Class_',1:9)) -write.csv(pred,file='submission.csv', quote=FALSE,row.names=FALSE) +pred <- format(pred, digits = 2, scientific = FALSE) # shrink the size of submission +pred <- data.frame(1:nrow(pred), pred) +names(pred) <- c('id', paste0('Class_', 1:9)) +write.csv(pred, file = 'submission.csv', quote = FALSE, row.names = FALSE) diff --git a/demo/kaggle-otto/understandingXGBoostModel.Rmd b/demo/kaggle-otto/understandingXGBoostModel.Rmd index ac8898f48..1939fa10c 100644 --- a/demo/kaggle-otto/understandingXGBoostModel.Rmd +++ b/demo/kaggle-otto/understandingXGBoostModel.Rmd @@ -31,7 +31,7 @@ require(methods) require(data.table) require(magrittr) train <- fread('data/train.csv', header = T, stringsAsFactors = FALSE) -test <- fread('data/test.csv', header=TRUE, stringsAsFactors = FALSE) +test <- fread('data/test.csv', header = TRUE, stringsAsFactors = FALSE) ``` > `magrittr` and `data.table` are here to make the code cleaner and much more rapid. @@ -42,13 +42,13 @@ Let's explore the dataset. 
dim(train) # Training content -train[1:6,1:5, with =FALSE] +train[1:6, 1:5, with = FALSE] # Test dataset dimensions dim(test) # Test content -test[1:6,1:5, with =FALSE] +test[1:6, 1:5, with = FALSE] ``` > We only display the 6 first rows and 5 first columns for convenience @@ -87,7 +87,7 @@ For that purpose, we will: ```{r classToIntegers} # Convert from classes to numbers y <- train[, nameLastCol, with = FALSE][[1]] %>% - gsub('Class_','',.) %>% + gsub('Class_', '', .) %>% as.integer %>% subtract(., 1) @@ -98,14 +98,14 @@ y[1:5] We remove label column from training dataset, otherwise **XGBoost** would use it to guess the labels! ```{r deleteCols, results='hide'} -train[, nameLastCol:=NULL, with = FALSE] +train[, nameLastCol := NULL, with = FALSE] ``` `data.table` is an awesome implementation of data.frame, unfortunately it is not a format supported natively by **XGBoost**. We need to convert both datasets (training and test) in `numeric` Matrix format. ```{r convertToNumericMatrix} -trainMatrix <- train[,lapply(.SD,as.numeric)] %>% as.matrix -testMatrix <- test[,lapply(.SD,as.numeric)] %>% as.matrix +trainMatrix <- train[, lapply(.SD, as.numeric)] %>% as.matrix +testMatrix <- test[, lapply(.SD, as.numeric)] %>% as.matrix ``` Model training @@ -127,7 +127,7 @@ param <- list("objective" = "multi:softprob", cv.nrounds <- 5 cv.nfold <- 3 -bst.cv <- xgb.cv(param=param, data = trainMatrix, label = y, +bst.cv <- xgb.cv(param = param, data = trainMatrix, label = y, nfold = cv.nfold, nrounds = cv.nrounds) ``` > As we can see the error rate is low on the test dataset (for a 5mn trained model). @@ -136,7 +136,7 @@ Finally, we are ready to train the real model!!! ```{r modelTraining} nrounds <- 50 -bst <- xgboost(param=param, data = trainMatrix, label = y, nrounds=nrounds) +bst <- xgboost(param = param, data = trainMatrix, label = y, nrounds = nrounds) ``` Model understanding @@ -189,7 +189,7 @@ names <- dimnames(trainMatrix)[[2]] importance_matrix <- xgb.importance(names, model = bst) # Nice graph -xgb.plot.importance(importance_matrix[1:10,]) +xgb.plot.importance(importance_matrix[1:10, ]) ``` > To make it understandable we first extract the column names from the `Matrix`. diff --git a/demo/multiclass_classification/train.R b/demo/multiclass_classification/train.R index a9ded8748..390690bda 100644 --- a/demo/multiclass_classification/train.R +++ b/demo/multiclass_classification/train.R @@ -16,8 +16,8 @@ df[, `:=`(V34 = as.integer(ifelse(V34 == "?", 0L, V34)), idx <- sample(nrow(df), size = round(0.7 * nrow(df)), replace = FALSE) -train <- df[idx,] -test <- df[-idx,] +train <- df[idx, ] +test <- df[-idx, ] train_x <- train[, 1:34] train_y <- train[, V35]
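# ---------------------------------------------------------------------------
# A minimal sketch (not part of the patch above) of how a split like
# `train_x` / `train_y` from the multiclass demo is typically handed to
# xgboost. The label recoding and the eta / max_depth / nrounds values below
# are illustrative assumptions, not values taken from the demo file.
require(xgboost)

labels <- as.integer(factor(train_y)) - 1L   # map class codes onto 0..(num_class - 1)
num_class <- length(unique(labels))
dtrain <- xgb.DMatrix(data = as.matrix(train_x), label = labels)

params <- list(
  objective = "multi:softprob"   # one probability per class and observation
  , num_class = num_class
  , eta = 0.3
  , max_depth = 6
  , nthread = 2
)
bst <- xgb.train(params = params, data = dtrain, nrounds = 50)

# predictions come back as one long vector; reshape to one row per observation
pred <- predict(bst, as.matrix(test[, 1:34]))
pred <- matrix(pred, ncol = num_class, byrow = TRUE)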