[R] address some lintr warnings (#8609)

2022-12-17 04:36:14 -06:00
parent 53e6e32718
commit 17ce1f26c8
18 changed files with 137 additions and 116 deletions
--- a/demo/data/gen_autoclaims.R
+++ b/demo/data/gen_autoclaims.R
@@ -1,8 +1,10 @@
 site <- 'http://cran.r-project.org'
-if (!require('dummies'))
-    install.packages('dummies', repos=site)
-if (!require('insuranceData'))
-    install.packages('insuranceData', repos=site)
+if (!requireNamespace('dummies', quietly = TRUE)) {  # probe only; library() attaches later
+    install.packages('dummies', repos = site)
+}
+if (!requireNamespace('insuranceData', quietly = TRUE)) {  # install only when missing
+    install.packages('insuranceData', repos = site)
+}

 library(dummies)
 library(insuranceData)
@@ -14,5 +16,16 @@ data$STATE <- as.factor(data$STATE)
 data$CLASS <- as.factor(data$CLASS)
 data$GENDER <- as.factor(data$GENDER)

-data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=TRUE);
-write.table(data.dummy, 'autoclaims.csv', sep=',', row.names=F, col.names=F, quote=F)
+data.dummy <- dummy.data.frame(
+    data
+    , dummy.class = 'factor'
+    , omit.constants = TRUE
+)
+write.table(
+    data.dummy
+    , 'autoclaims.csv'
+    , sep = ','
+    , row.names = FALSE
+    , col.names = FALSE
+    , quote = FALSE
+)
--- a/demo/kaggle-higgs/higgs-pred.R
+++ b/demo/kaggle-higgs/higgs-pred.R
@@ -4,21 +4,21 @@ require(methods)

 modelfile <- "higgs.model"
 outfile <- "higgs.pred.csv"
-dtest <- read.csv("data/test.csv", header=TRUE)
+dtest <- read.csv("data/test.csv", header = TRUE)
 data <- as.matrix(dtest[2:31])
 idx <- dtest[[1]]

 xgmat <- xgb.DMatrix(data, missing = -999.0)
-bst <- xgb.load(modelfile=modelfile)
+bst <- xgb.load(modelfile = modelfile)
 ypred <- predict(bst, xgmat)

-rorder <- rank(ypred, ties.method="first")
+rorder <- rank(ypred, ties.method = "first")

 threshold <- 0.15
 # to be completed
-ntop <- length(rorder) - as.integer(threshold*length(rorder))
+ntop <- length(rorder) - as.integer(threshold * length(rorder))
 plabel <- ifelse(rorder > ntop, "s", "b")
 outdata <- list("EventId" = idx,
                "RankOrder" = rorder,
                "Class" = plabel)
-write.csv(outdata, file = outfile, quote=FALSE, row.names=FALSE)
+write.csv(outdata, file = outfile, quote = FALSE, row.names = FALSE)
--- a/demo/kaggle-higgs/higgs-train.R
+++ b/demo/kaggle-higgs/higgs-train.R
@@ -4,14 +4,14 @@ require(methods)

 testsize <- 550000

-dtrain <- read.csv("data/training.csv", header=TRUE)
+dtrain <- read.csv("data/training.csv", header = TRUE)
 dtrain[33] <- dtrain[33] == "s"
 label <- as.numeric(dtrain[[33]])
 data <- as.matrix(dtrain[2:31])
 weight <- as.numeric(dtrain[[32]]) * testsize / length(label)

-sumwpos <- sum(weight * (label==1.0))
-sumwneg <- sum(weight * (label==0.0))
+sumwpos <- sum(weight * (label == 1.0))
+sumwneg <- sum(weight * (label == 0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))

 xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
@@ -25,7 +25,7 @@ param <- list("objective" = "binary:logitraw",
 watchlist <- list("train" = xgmat)
 nrounds <- 120
 print ("loading data end, start to boost trees")
-bst <- xgb.train(param, xgmat, nrounds, watchlist );
+bst <- xgb.train(param, xgmat, nrounds, watchlist)
 # save out model
 xgb.save(bst, "higgs.model")
 print ('finish training')
--- a/demo/kaggle-higgs/speedtest.R
+++ b/demo/kaggle-higgs/speedtest.R
@@ -5,10 +5,10 @@ require(methods)

 testsize <- 550000

-dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)
-dtrain$Label <- as.numeric(dtrain$Label=='s')
+dtrain <- read.csv("data/training.csv", header = TRUE, nrows = 350001)
+dtrain$Label <- as.numeric(dtrain$Label == 's')
 # gbm.time = system.time({
-#   gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120, 
+#   gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
 #                    interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,
 #                    verbose = TRUE)
 # })
@@ -20,12 +20,12 @@ dtrain$Label <- as.numeric(dtrain$Label=='s')
 data <- as.matrix(dtrain[2:31])
 weight <- as.numeric(dtrain[[32]]) * testsize / length(label)

-sumwpos <- sum(weight * (label==1.0))
-sumwneg <- sum(weight * (label==0.0))
+sumwpos <- sum(weight * (label == 1.0))
+sumwneg <- sum(weight * (label == 0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))

 xgboost.time <- list()
-threads <- c(1,2,4,8,16)
+threads <- c(1, 2, 4, 8, 16)
 for (i in 1:length(threads)){
  thread <- threads[i]
  xgboost.time[[i]] <- system.time({
@@ -40,7 +40,7 @@ for (i in 1:length(threads)){
    watchlist <- list("train" = xgmat)
    nrounds <- 120
    print ("loading data end, start to boost trees")
-    bst <- xgb.train(param, xgmat, nrounds, watchlist );
+    bst <- xgb.train(param, xgmat, nrounds, watchlist)
    # save out model
    xgb.save(bst, "higgs.model")
    print ('finish training')
@@ -49,22 +49,21 @@ for (i in 1:length(threads)){

 xgboost.time
 # [[1]]
-# user  system elapsed 
-# 99.015   0.051  98.982 
-# 
+# user  system elapsed
+# 99.015   0.051  98.982
+#
 # [[2]]
-# user  system elapsed 
-# 100.268   0.317  55.473 
-# 
+# user  system elapsed
+# 100.268   0.317  55.473
+#
 # [[3]]
-# user  system elapsed 
-# 111.682   0.777  35.963 
-# 
+# user  system elapsed
+# 111.682   0.777  35.963
+#
 # [[4]]
-# user  system elapsed 
-# 149.396   1.851  32.661 
-# 
+# user  system elapsed
+# 149.396   1.851  32.661
+#
 # [[5]]
-# user  system elapsed 
-# 157.390   5.988  40.949 
-
+# user  system elapsed
+# 157.390   5.988  40.949
--- a/demo/kaggle-otto/otto_train_pred.R
+++ b/demo/kaggle-otto/otto_train_pred.R
@@ -1,20 +1,20 @@
 require(xgboost)
 require(methods)

-train <- read.csv('data/train.csv',header=TRUE,stringsAsFactors = FALSE)
-test <- read.csv('data/test.csv',header=TRUE,stringsAsFactors = FALSE)
-train <- train[,-1]
-test <- test[,-1]
+train <- read.csv('data/train.csv', header = TRUE, stringsAsFactors = FALSE)
+test <- read.csv('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
+train <- train[, -1]
+test <- test[, -1]

-y <- train[,ncol(train)]
-y <- gsub('Class_','',y)
-y <- as.integer(y)-1  # xgboost take features in [0,numOfClass)
+y <- train[, ncol(train)]
+y <- gsub('Class_', '', y)
+y <- as.integer(y) - 1  # xgboost takes labels in [0, numOfClass)

-x <- rbind(train[,-ncol(train)],test)
+x <- rbind(train[, -ncol(train)], test)
 x <- as.matrix(x)
-x <- matrix(as.numeric(x),nrow(x),ncol(x))
+x <- matrix(as.numeric(x), nrow(x), ncol(x))
 trind <- 1:length(y)
-teind <- (nrow(train)+1):nrow(x)
+teind <- (nrow(train) + 1):nrow(x)

 # Set necessary parameter
 param <- list("objective" = "multi:softprob",
@@ -24,20 +24,25 @@ param <- list("objective" = "multi:softprob",

 # Run Cross Validation
 cv.nrounds <- 50
-bst.cv <- xgb.cv(param=param, data = x[trind,], label = y, 
-                nfold = 3, nrounds=cv.nrounds)
+bst.cv <- xgb.cv(
+    params = param  # full argument name; 'param' only worked via partial matching
+    , data = x[trind, ]
+    , label = y
+    , nfold = 3
+    , nrounds = cv.nrounds
+)

 # Train the model
 nrounds <- 50
-bst <- xgboost(param=param, data = x[trind,], label = y, nrounds=nrounds)
+bst <- xgboost(params = param, data = x[trind, ], label = y, nrounds = nrounds)

 # Make prediction
-pred <- predict(bst,x[teind,])
-pred <- matrix(pred,9,length(pred)/9)
+pred <- predict(bst, x[teind, ])
+pred <- matrix(pred, 9, length(pred) / 9)
 pred <- t(pred)

 # Output submission
-pred <- format(pred, digits=2,scientific=F) # shrink the size of submission
-pred <- data.frame(1:nrow(pred),pred)
-names(pred) <- c('id', paste0('Class_',1:9))
-write.csv(pred,file='submission.csv', quote=FALSE,row.names=FALSE)
+pred <- format(pred, digits = 2, scientific = FALSE) # shrink the size of submission
+pred <- data.frame(seq_len(nrow(pred)), pred)  # row ids; seq_len() is safe for zero rows
+names(pred) <- c('id', paste0('Class_', 1:9))
+write.csv(pred, file = 'submission.csv', quote = FALSE, row.names = FALSE)
--- a/demo/kaggle-otto/understandingXGBoostModel.Rmd
+++ b/demo/kaggle-otto/understandingXGBoostModel.Rmd
@@ -31,7 +31,7 @@ require(methods)
 require(data.table)
 require(magrittr)
 train <- fread('data/train.csv', header = T, stringsAsFactors = FALSE)
-test <- fread('data/test.csv', header=TRUE, stringsAsFactors = FALSE)
+test <- fread('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
 ```
 > `magrittr` and `data.table` are here to make the code cleaner and much more rapid.

@@ -42,13 +42,13 @@ Let's explore the dataset.
 dim(train)

 # Training content
-train[1:6,1:5, with =FALSE]
+train[1:6, 1:5, with = FALSE]

 # Test dataset dimensions
 dim(test)

 # Test content
-test[1:6,1:5, with =FALSE]
+test[1:6, 1:5, with = FALSE]
 ```
 > We only display the 6 first rows and 5 first columns for convenience

@@ -87,7 +87,7 @@ For that purpose, we will:
 ```{r classToIntegers}
 # Convert from classes to numbers
 y <- train[, nameLastCol, with = FALSE][[1]] %>%
-    gsub('Class_','',.) %>%
+    gsub('Class_', '', .) %>%
    as.integer %>%
    subtract(., 1)

@@ -98,14 +98,14 @@ y[1:5]
 We remove label column from training dataset, otherwise **XGBoost** would use it to guess the labels!

 ```{r deleteCols, results='hide'}
-train[, nameLastCol:=NULL, with = FALSE]
+train[, (nameLastCol) := NULL]  # parenthesised LHS: drop the column whose name is stored in nameLastCol
 ```

 `data.table` is an awesome implementation of data.frame, unfortunately it is not a format supported natively by **XGBoost**. We need to convert both datasets (training and test) in `numeric` Matrix format.

 ```{r convertToNumericMatrix}
-trainMatrix <- train[,lapply(.SD,as.numeric)] %>% as.matrix
-testMatrix <- test[,lapply(.SD,as.numeric)] %>% as.matrix
+trainMatrix <- train[, lapply(.SD, as.numeric)] %>% as.matrix
+testMatrix <- test[, lapply(.SD, as.numeric)] %>% as.matrix
 ```

 Model training
@@ -127,7 +127,7 @@ param <- list("objective" = "multi:softprob",
 cv.nrounds <- 5
 cv.nfold <- 3

-bst.cv <- xgb.cv(param=param, data = trainMatrix, label = y,
+bst.cv <- xgb.cv(params = param, data = trainMatrix, label = y,
                nfold = cv.nfold, nrounds = cv.nrounds)
 ```
 > As we can see the error rate is low on the test dataset (for a 5mn trained model).
@@ -136,7 +136,7 @@ Finally, we are ready to train the real model!!!

 ```{r modelTraining}
 nrounds <- 50
-bst <- xgboost(param=param, data = trainMatrix, label = y, nrounds=nrounds)
+bst <- xgboost(params = param, data = trainMatrix, label = y, nrounds = nrounds)
 ```

 Model understanding
@@ -189,7 +189,7 @@ names <- dimnames(trainMatrix)[[2]]
 importance_matrix <- xgb.importance(names, model = bst)

 # Nice graph
-xgb.plot.importance(importance_matrix[1:10,])
+xgb.plot.importance(importance_matrix[1:10, ])
 ```

 > To make it understandable we first extract the column names from the `Matrix`.
--- a/demo/multiclass_classification/train.R
+++ b/demo/multiclass_classification/train.R
@@ -16,8 +16,8 @@ df[, `:=`(V34 = as.integer(ifelse(V34 == "?", 0L, V34)),

 idx <- sample(nrow(df), size = round(0.7 * nrow(df)), replace = FALSE)

-train <- df[idx,]
-test <- df[-idx,]
+train <- df[idx, ]
+test <- df[-idx, ]

 train_x <- train[, 1:34]
 train_y <- train[, V35]