[R] address some lintr warnings (#8609)

This commit is contained in:
James Lamb
2022-12-17 04:36:14 -06:00
committed by GitHub
parent 53e6e32718
commit 17ce1f26c8
18 changed files with 137 additions and 116 deletions

View File

@@ -1,20 +1,20 @@
# Load and prepare the Otto Group data: drop id columns, extract integer
# class labels, and build a single numeric feature matrix for train + test.
require(xgboost)
require(methods)

train <- read.csv('data/train.csv', header = TRUE, stringsAsFactors = FALSE)
test <- read.csv('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
train <- train[, -1]  # drop the id column
test <- test[, -1]

# Labels: strip the 'Class_' prefix and shift to zero-based integers,
# since xgboost expects multiclass labels in [0, num_class).
y <- train[, ncol(train)]
y <- gsub('Class_', '', y)
y <- as.integer(y) - 1

# Stack train and test features into one all-numeric matrix.
x <- rbind(train[, -ncol(train)], test)
x <- as.matrix(x)
x <- matrix(as.numeric(x), nrow(x), ncol(x))
trind <- seq_along(y)                # row indices of the training portion
teind <- (nrow(train) + 1):nrow(x)   # row indices of the test portion
# Set necessary parameter
param <- list("objective" = "multi:softprob",
@@ -24,20 +24,25 @@ param <- list("objective" = "multi:softprob",
# Run cross-validation to sanity-check the parameters before the real fit.
cv.nrounds <- 50
bst.cv <- xgb.cv(
  param = param
  , data = x[trind, ]
  , label = y
  , nfold = 3
  , nrounds = cv.nrounds
)

# Train the final model on the full training set.
nrounds <- 50
bst <- xgboost(param = param, data = x[trind, ], label = y, nrounds = nrounds)

# Predict class probabilities. predict() returns a flat vector, so
# reshape it to 9 rows (one per class) and transpose to get one row
# per test observation.
pred <- predict(bst, x[teind, ])
pred <- matrix(pred, 9, length(pred) / 9)
pred <- t(pred)

# Output submission: round probabilities to shrink the file size,
# prepend a 1-based id column, and name the probability columns.
pred <- format(pred, digits = 2, scientific = FALSE)
pred <- data.frame(1:nrow(pred), pred)
names(pred) <- c('id', paste0('Class_', 1:9))
write.csv(pred, file = 'submission.csv', quote = FALSE, row.names = FALSE)

View File

@@ -31,7 +31,7 @@ require(methods)
require(data.table)
require(magrittr)
train <- fread('data/train.csv', header = TRUE, stringsAsFactors = FALSE)
test <- fread('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
```
> `magrittr` and `data.table` are here to make the code cleaner and much more rapid.
@@ -42,13 +42,13 @@ Let's explore the dataset.
dim(train)
# Training content
train[1:6, 1:5, with = FALSE]
# Test dataset dimensions
dim(test)
# Test content
test[1:6, 1:5, with = FALSE]
```
> We only display the 6 first rows and 5 first columns for convenience
@@ -87,7 +87,7 @@ For that purpose, we will:
```{r classToIntegers}
# Convert from classes to numbers
y <- train[, nameLastCol, with = FALSE][[1]] %>%
gsub('Class_', '', .) %>%
as.integer %>%
subtract(., 1)
@@ -98,14 +98,14 @@ y[1:5]
We remove label column from training dataset, otherwise **XGBoost** would use it to guess the labels!
```{r deleteCols, results='hide'}
train[, nameLastCol := NULL, with = FALSE]
```
`data.table` is an awesome implementation of data.frame, unfortunately it is not a format supported natively by **XGBoost**. We need to convert both datasets (training and test) in `numeric` Matrix format.
```{r convertToNumericMatrix}
trainMatrix <- train[, lapply(.SD, as.numeric)] %>% as.matrix
testMatrix <- test[, lapply(.SD, as.numeric)] %>% as.matrix
```
Model training
@@ -127,7 +127,7 @@ param <- list("objective" = "multi:softprob",
cv.nrounds <- 5
cv.nfold <- 3
bst.cv <- xgb.cv(param = param, data = trainMatrix, label = y,
nfold = cv.nfold, nrounds = cv.nrounds)
```
> As we can see the error rate is low on the test dataset (for a 5mn trained model).
@@ -136,7 +136,7 @@ Finally, we are ready to train the real model!!!
```{r modelTraining}
nrounds <- 50
bst <- xgboost(param = param, data = trainMatrix, label = y, nrounds = nrounds)
```
Model understanding
@@ -189,7 +189,7 @@ names <- dimnames(trainMatrix)[[2]]
importance_matrix <- xgb.importance(names, model = bst)
# Nice graph
xgb.plot.importance(importance_matrix[1:10, ])
```
> To make it understandable we first extract the column names from the `Matrix`.