[R] address some lintr warnings (#8609)
This commit is contained in:
@@ -1,8 +1,10 @@
|
||||
site <- 'http://cran.r-project.org'
|
||||
if (!require('dummies'))
|
||||
install.packages('dummies', repos=site)
|
||||
if (!require('insuranceData'))
|
||||
install.packages('insuranceData', repos=site)
|
||||
if (!require('dummies')) {
|
||||
install.packages('dummies', repos = site)
|
||||
}
|
||||
if (!require('insuranceData')) {
|
||||
install.packages('insuranceData', repos = site)
|
||||
}
|
||||
|
||||
library(dummies)
|
||||
library(insuranceData)
|
||||
@@ -14,5 +16,16 @@ data$STATE <- as.factor(data$STATE)
|
||||
data$CLASS <- as.factor(data$CLASS)
|
||||
data$GENDER <- as.factor(data$GENDER)
|
||||
|
||||
data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=TRUE);
|
||||
write.table(data.dummy, 'autoclaims.csv', sep=',', row.names=F, col.names=F, quote=F)
|
||||
data.dummy <- dummy.data.frame(
|
||||
data
|
||||
, dummy.class = 'factor'
|
||||
, omit.constants = TRUE
|
||||
)
|
||||
write.table(
|
||||
data.dummy
|
||||
, 'autoclaims.csv'
|
||||
, sep = ','
|
||||
, row.names = FALSE
|
||||
, col.names = FALSE
|
||||
, quote = FALSE
|
||||
)
|
||||
|
||||
@@ -4,21 +4,21 @@ require(methods)
|
||||
|
||||
modelfile <- "higgs.model"
|
||||
outfile <- "higgs.pred.csv"
|
||||
dtest <- read.csv("data/test.csv", header=TRUE)
|
||||
dtest <- read.csv("data/test.csv", header = TRUE)
|
||||
data <- as.matrix(dtest[2:31])
|
||||
idx <- dtest[[1]]
|
||||
|
||||
xgmat <- xgb.DMatrix(data, missing = -999.0)
|
||||
bst <- xgb.load(modelfile=modelfile)
|
||||
bst <- xgb.load(modelfile = modelfile)
|
||||
ypred <- predict(bst, xgmat)
|
||||
|
||||
rorder <- rank(ypred, ties.method="first")
|
||||
rorder <- rank(ypred, ties.method = "first")
|
||||
|
||||
threshold <- 0.15
|
||||
# to be completed
|
||||
ntop <- length(rorder) - as.integer(threshold*length(rorder))
|
||||
ntop <- length(rorder) - as.integer(threshold * length(rorder))
|
||||
plabel <- ifelse(rorder > ntop, "s", "b")
|
||||
outdata <- list("EventId" = idx,
|
||||
"RankOrder" = rorder,
|
||||
"Class" = plabel)
|
||||
write.csv(outdata, file = outfile, quote=FALSE, row.names=FALSE)
|
||||
write.csv(outdata, file = outfile, quote = FALSE, row.names = FALSE)
|
||||
|
||||
@@ -4,14 +4,14 @@ require(methods)
|
||||
|
||||
testsize <- 550000
|
||||
|
||||
dtrain <- read.csv("data/training.csv", header=TRUE)
|
||||
dtrain <- read.csv("data/training.csv", header = TRUE)
|
||||
dtrain[33] <- dtrain[33] == "s"
|
||||
label <- as.numeric(dtrain[[33]])
|
||||
data <- as.matrix(dtrain[2:31])
|
||||
weight <- as.numeric(dtrain[[32]]) * testsize / length(label)
|
||||
|
||||
sumwpos <- sum(weight * (label==1.0))
|
||||
sumwneg <- sum(weight * (label==0.0))
|
||||
sumwpos <- sum(weight * (label == 1.0))
|
||||
sumwneg <- sum(weight * (label == 0.0))
|
||||
print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
|
||||
|
||||
xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
|
||||
@@ -25,7 +25,7 @@ param <- list("objective" = "binary:logitraw",
|
||||
watchlist <- list("train" = xgmat)
|
||||
nrounds <- 120
|
||||
print ("loading data end, start to boost trees")
|
||||
bst <- xgb.train(param, xgmat, nrounds, watchlist );
|
||||
bst <- xgb.train(param, xgmat, nrounds, watchlist)
|
||||
# save out model
|
||||
xgb.save(bst, "higgs.model")
|
||||
print ('finish training')
|
||||
|
||||
@@ -5,10 +5,10 @@ require(methods)
|
||||
|
||||
testsize <- 550000
|
||||
|
||||
dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)
|
||||
dtrain$Label <- as.numeric(dtrain$Label=='s')
|
||||
dtrain <- read.csv("data/training.csv", header = TRUE, nrows = 350001)
|
||||
dtrain$Label <- as.numeric(dtrain$Label == 's')
|
||||
# gbm.time = system.time({
|
||||
# gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
|
||||
# gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
|
||||
# interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,
|
||||
# verbose = TRUE)
|
||||
# })
|
||||
@@ -20,12 +20,12 @@ dtrain$Label <- as.numeric(dtrain$Label=='s')
|
||||
data <- as.matrix(dtrain[2:31])
|
||||
weight <- as.numeric(dtrain[[32]]) * testsize / length(label)
|
||||
|
||||
sumwpos <- sum(weight * (label==1.0))
|
||||
sumwneg <- sum(weight * (label==0.0))
|
||||
sumwpos <- sum(weight * (label == 1.0))
|
||||
sumwneg <- sum(weight * (label == 0.0))
|
||||
print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
|
||||
|
||||
xgboost.time <- list()
|
||||
threads <- c(1,2,4,8,16)
|
||||
threads <- c(1, 2, 4, 8, 16)
|
||||
for (i in 1:length(threads)){
|
||||
thread <- threads[i]
|
||||
xgboost.time[[i]] <- system.time({
|
||||
@@ -40,7 +40,7 @@ for (i in 1:length(threads)){
|
||||
watchlist <- list("train" = xgmat)
|
||||
nrounds <- 120
|
||||
print ("loading data end, start to boost trees")
|
||||
bst <- xgb.train(param, xgmat, nrounds, watchlist );
|
||||
bst <- xgb.train(param, xgmat, nrounds, watchlist)
|
||||
# save out model
|
||||
xgb.save(bst, "higgs.model")
|
||||
print ('finish training')
|
||||
@@ -49,22 +49,21 @@ for (i in 1:length(threads)){
|
||||
|
||||
xgboost.time
|
||||
# [[1]]
|
||||
# user system elapsed
|
||||
# 99.015 0.051 98.982
|
||||
#
|
||||
# user system elapsed
|
||||
# 99.015 0.051 98.982
|
||||
#
|
||||
# [[2]]
|
||||
# user system elapsed
|
||||
# 100.268 0.317 55.473
|
||||
#
|
||||
# user system elapsed
|
||||
# 100.268 0.317 55.473
|
||||
#
|
||||
# [[3]]
|
||||
# user system elapsed
|
||||
# 111.682 0.777 35.963
|
||||
#
|
||||
# user system elapsed
|
||||
# 111.682 0.777 35.963
|
||||
#
|
||||
# [[4]]
|
||||
# user system elapsed
|
||||
# 149.396 1.851 32.661
|
||||
#
|
||||
# user system elapsed
|
||||
# 149.396 1.851 32.661
|
||||
#
|
||||
# [[5]]
|
||||
# user system elapsed
|
||||
# 157.390 5.988 40.949
|
||||
|
||||
# user system elapsed
|
||||
# 157.390 5.988 40.949
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
require(xgboost)
|
||||
require(methods)
|
||||
|
||||
train <- read.csv('data/train.csv',header=TRUE,stringsAsFactors = FALSE)
|
||||
test <- read.csv('data/test.csv',header=TRUE,stringsAsFactors = FALSE)
|
||||
train <- train[,-1]
|
||||
test <- test[,-1]
|
||||
train <- read.csv('data/train.csv', header = TRUE, stringsAsFactors = FALSE)
|
||||
test <- read.csv('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
|
||||
train <- train[, -1]
|
||||
test <- test[, -1]
|
||||
|
||||
y <- train[,ncol(train)]
|
||||
y <- gsub('Class_','',y)
|
||||
y <- as.integer(y)-1 # xgboost take features in [0,numOfClass)
|
||||
y <- train[, ncol(train)]
|
||||
y <- gsub('Class_', '', y)
|
||||
y <- as.integer(y) - 1 # xgboost take features in [0,numOfClass)
|
||||
|
||||
x <- rbind(train[,-ncol(train)],test)
|
||||
x <- rbind(train[, -ncol(train)], test)
|
||||
x <- as.matrix(x)
|
||||
x <- matrix(as.numeric(x),nrow(x),ncol(x))
|
||||
x <- matrix(as.numeric(x), nrow(x), ncol(x))
|
||||
trind <- 1:length(y)
|
||||
teind <- (nrow(train)+1):nrow(x)
|
||||
teind <- (nrow(train) + 1):nrow(x)
|
||||
|
||||
# Set necessary parameter
|
||||
param <- list("objective" = "multi:softprob",
|
||||
@@ -24,20 +24,25 @@ param <- list("objective" = "multi:softprob",
|
||||
|
||||
# Run Cross Validation
|
||||
cv.nrounds <- 50
|
||||
bst.cv <- xgb.cv(param=param, data = x[trind,], label = y,
|
||||
nfold = 3, nrounds=cv.nrounds)
|
||||
bst.cv <- xgb.cv(
|
||||
param = param
|
||||
, data = x[trind, ]
|
||||
, label = y
|
||||
, nfold = 3
|
||||
, nrounds = cv.nrounds
|
||||
)
|
||||
|
||||
# Train the model
|
||||
nrounds <- 50
|
||||
bst <- xgboost(param=param, data = x[trind,], label = y, nrounds=nrounds)
|
||||
bst <- xgboost(param = param, data = x[trind, ], label = y, nrounds = nrounds)
|
||||
|
||||
# Make prediction
|
||||
pred <- predict(bst,x[teind,])
|
||||
pred <- matrix(pred,9,length(pred)/9)
|
||||
pred <- predict(bst, x[teind, ])
|
||||
pred <- matrix(pred, 9, length(pred) / 9)
|
||||
pred <- t(pred)
|
||||
|
||||
# Output submission
|
||||
pred <- format(pred, digits=2,scientific=F) # shrink the size of submission
|
||||
pred <- data.frame(1:nrow(pred),pred)
|
||||
names(pred) <- c('id', paste0('Class_',1:9))
|
||||
write.csv(pred,file='submission.csv', quote=FALSE,row.names=FALSE)
|
||||
pred <- format(pred, digits = 2, scientific = FALSE) # shrink the size of submission
|
||||
pred <- data.frame(1:nrow(pred), pred)
|
||||
names(pred) <- c('id', paste0('Class_', 1:9))
|
||||
write.csv(pred, file = 'submission.csv', quote = FALSE, row.names = FALSE)
|
||||
|
||||
@@ -31,7 +31,7 @@ require(methods)
|
||||
require(data.table)
|
||||
require(magrittr)
|
||||
train <- fread('data/train.csv', header = T, stringsAsFactors = FALSE)
|
||||
test <- fread('data/test.csv', header=TRUE, stringsAsFactors = FALSE)
|
||||
test <- fread('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
|
||||
```
|
||||
> `magrittr` and `data.table` are here to make the code cleaner and much faster.
|
||||
|
||||
@@ -42,13 +42,13 @@ Let's explore the dataset.
|
||||
dim(train)
|
||||
|
||||
# Training content
|
||||
train[1:6,1:5, with =FALSE]
|
||||
train[1:6, 1:5, with = FALSE]
|
||||
|
||||
# Test dataset dimensions
|
||||
dim(test)
|
||||
|
||||
# Test content
|
||||
test[1:6,1:5, with =FALSE]
|
||||
test[1:6, 1:5, with = FALSE]
|
||||
```
|
||||
> We only display the first 6 rows and first 5 columns for convenience.
|
||||
|
||||
@@ -87,7 +87,7 @@ For that purpose, we will:
|
||||
```{r classToIntegers}
|
||||
# Convert from classes to numbers
|
||||
y <- train[, nameLastCol, with = FALSE][[1]] %>%
|
||||
gsub('Class_','',.) %>%
|
||||
gsub('Class_', '', .) %>%
|
||||
as.integer %>%
|
||||
subtract(., 1)
|
||||
|
||||
@@ -98,14 +98,14 @@ y[1:5]
|
||||
We remove the label column from the training dataset; otherwise **XGBoost** would use it to guess the labels!
|
||||
|
||||
```{r deleteCols, results='hide'}
|
||||
train[, nameLastCol:=NULL, with = FALSE]
|
||||
train[, nameLastCol := NULL, with = FALSE]
|
||||
```
|
||||
|
||||
`data.table` is an awesome implementation of data.frame; unfortunately, it is not a format supported natively by **XGBoost**. We need to convert both datasets (training and test) into `numeric` Matrix format.
|
||||
|
||||
```{r convertToNumericMatrix}
|
||||
trainMatrix <- train[,lapply(.SD,as.numeric)] %>% as.matrix
|
||||
testMatrix <- test[,lapply(.SD,as.numeric)] %>% as.matrix
|
||||
trainMatrix <- train[, lapply(.SD, as.numeric)] %>% as.matrix
|
||||
testMatrix <- test[, lapply(.SD, as.numeric)] %>% as.matrix
|
||||
```
|
||||
|
||||
Model training
|
||||
@@ -127,7 +127,7 @@ param <- list("objective" = "multi:softprob",
|
||||
cv.nrounds <- 5
|
||||
cv.nfold <- 3
|
||||
|
||||
bst.cv <- xgb.cv(param=param, data = trainMatrix, label = y,
|
||||
bst.cv <- xgb.cv(param = param, data = trainMatrix, label = y,
|
||||
nfold = cv.nfold, nrounds = cv.nrounds)
|
||||
```
|
||||
> As we can see, the error rate is low on the test dataset (for a model trained in 5 minutes).
|
||||
@@ -136,7 +136,7 @@ Finally, we are ready to train the real model!!!
|
||||
|
||||
```{r modelTraining}
|
||||
nrounds <- 50
|
||||
bst <- xgboost(param=param, data = trainMatrix, label = y, nrounds=nrounds)
|
||||
bst <- xgboost(param = param, data = trainMatrix, label = y, nrounds = nrounds)
|
||||
```
|
||||
|
||||
Model understanding
|
||||
@@ -189,7 +189,7 @@ names <- dimnames(trainMatrix)[[2]]
|
||||
importance_matrix <- xgb.importance(names, model = bst)
|
||||
|
||||
# Nice graph
|
||||
xgb.plot.importance(importance_matrix[1:10,])
|
||||
xgb.plot.importance(importance_matrix[1:10, ])
|
||||
```
|
||||
|
||||
> To make it understandable we first extract the column names from the `Matrix`.
|
||||
|
||||
@@ -16,8 +16,8 @@ df[, `:=`(V34 = as.integer(ifelse(V34 == "?", 0L, V34)),
|
||||
|
||||
idx <- sample(nrow(df), size = round(0.7 * nrow(df)), replace = FALSE)
|
||||
|
||||
train <- df[idx,]
|
||||
test <- df[-idx,]
|
||||
train <- df[idx, ]
|
||||
test <- df[-idx, ]
|
||||
|
||||
train_x <- train[, 1:34]
|
||||
train_y <- train[, V35]
|
||||
|
||||
Reference in New Issue
Block a user