[R] address some lintr warnings (#8609)
parent 53e6e32718
commit 17ce1f26c8
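The hunks below are purely stylistic lintr fixes across the R package sources, tests, demos, and vignettes: spaces around `=` and other infix operators, spaces after commas, braces around one-line `if` bodies, and removal of trailing semicolons. As a rough sketch of how warnings like these can be surfaced locally with the {lintr} package (the directory path and the specific linter selection below are illustrative assumptions, not the project's actual lint configuration):

```r
# A minimal sketch (not the project's actual configuration): run {lintr} over
# the package R sources and flag the kinds of issues fixed in this commit.
# Assumes lintr >= 3.0 and that the working directory is the repository root.
library(lintr)

lints <- lint_dir(
  "R-package/R",  # hypothetical path; point this at the sources you want checked
  linters = linters_with_defaults(
    infix_spaces_linter(),  # e.g. `sparse=FALSE`    -> `sparse = FALSE`
    commas_linter(),        # e.g. `df[,ID := NULL]` -> `df[, ID := NULL]`
    semicolon_linter(),     # flags trailing `;`
    brace_linter()          # flags un-braced one-line `if` bodies
  )
)
print(lints)
```

Each returned lint carries the file, line number, and linter name, which maps directly onto the changes in the hunks that follow.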
@@ -611,7 +611,7 @@ cb.cv.predict <- function(save_models = FALSE) {
 #' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l')
 #'
 #' @export
-cb.gblinear.history <- function(sparse=FALSE) {
+cb.gblinear.history <- function(sparse = FALSE) {
   coefs <- NULL

   init <- function(env) {

@@ -629,7 +629,7 @@ xgb.attributes <- function(object) {
 #' @export
 xgb.config <- function(object) {
   handle <- xgb.get.handle(object)
-  .Call(XGBoosterSaveJsonConfig_R, handle);
+  .Call(XGBoosterSaveJsonConfig_R, handle)
 }

 #' @rdname xgb.config

@@ -119,10 +119,10 @@
 #' print(cv, verbose=TRUE)
 #'
 #' @export
-xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NA,
-                   prediction = FALSE, showsd = TRUE, metrics=list(),
+xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
+                   prediction = FALSE, showsd = TRUE, metrics = list(),
                    obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, train_folds = NULL,
-                   verbose = TRUE, print_every_n=1L,
+                   verbose = TRUE, print_every_n = 1L,
                    early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) {

   check.deprecation(...)

@@ -38,7 +38,7 @@
 #' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
 #'
 #' @export
-xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE,
+xgb.dump <- function(model, fname = NULL, fmap = "", with_stats = FALSE,
                      dump_format = c("text", "json"), ...) {
   check.deprecation(...)
   dump_format <- match.arg(dump_format)

@@ -34,7 +34,7 @@ df[, ID := NULL]
 # Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
 fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
 # train a xgbTree model using caret::train
-model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl)
+model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl)

 # Instead of tree for our boosters, you can also fit a linear regression or logistic regression model
 # using xgbLinear

@@ -404,7 +404,7 @@ test_that("Configuration works", {
   config <- xgb.config(bst)
   xgb.config(bst) <- config
   reloaded_config <- xgb.config(bst)
-  expect_equal(config, reloaded_config);
+  expect_equal(config, reloaded_config)
 })

 test_that("strict_shape works", {

@@ -53,7 +53,7 @@ test_that("xgb.DMatrix: saving, loading", {
   dtrain <- xgb.DMatrix(tmp_file)
   expect_equal(colnames(dtrain), cnames)

-  ft <- rep(c("c", "q"), each=length(cnames)/2)
+  ft <- rep(c("c", "q"), each = length(cnames) / 2)
   setinfo(dtrain, "feature_type", ft)
   expect_equal(ft, getinfo(dtrain, "feature_type"))
 })

@@ -440,7 +440,7 @@ test_that("xgb.plot.shap.summary works", {
 })

 test_that("check.deprecation works", {
-  ttt <- function(a = NNULL, DUMMY=NULL, ...) {
+  ttt <- function(a = NNULL, DUMMY = NULL, ...) {
     check.deprecation(...)
     as.list((environment()))
   }

@@ -28,7 +28,9 @@ Package loading:
 require(xgboost)
 require(Matrix)
 require(data.table)
-if (!require('vcd')) install.packages('vcd')
+if (!require('vcd')) {
+  install.packages('vcd')
+}
 ```

 > **VCD** package is used for one of its embedded dataset only.

@@ -100,7 +102,7 @@ Note that we transform it to `factor` so the algorithm treat these age groups as
 Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation.

 ```{r}
-head(df[,AgeDiscret := as.factor(round(Age/10,0))])
+head(df[, AgeDiscret := as.factor(round(Age / 10, 0))])
 ```

 ##### Random split into two groups

@@ -108,7 +110,7 @@ head(df[,AgeDiscret := as.factor(round(Age/10,0))])
 Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. We choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...).

 ```{r}
-head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
+head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))])
 ```

 ##### Risks in adding correlated features

@@ -124,13 +126,13 @@ Fortunately, decision tree algorithms (including boosted trees) are very robust
 We remove ID as there is nothing to learn from this feature (it would just add some noise).

 ```{r, results='hide'}
-df[,ID:=NULL]
+df[, ID := NULL]
 ```

 We will list the different values for the column `Treatment`:

 ```{r}
-levels(df[,Treatment])
+levels(df[, Treatment])
 ```


@@ -147,7 +149,7 @@ For example, the column `Treatment` will be replaced by two columns, `TreatmentP
 Column `Improved` is excluded because it will be our `label` column, the one we want to predict.

 ```{r, warning=FALSE,message=FALSE}
-sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1]
+sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[, -1]
 head(sparse_matrix)
 ```

@@ -156,7 +158,7 @@ head(sparse_matrix)
 Create the output `numeric` vector (not as a sparse `Matrix`):

 ```{r}
-output_vector <- df[,Improved] == "Marked"
+output_vector <- df[, Improved] == "Marked"
 ```

 1. set `Y` vector to `0`;

@@ -170,7 +172,7 @@ The code below is very usual. For more information, you can look at the document

 ```{r}
 bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
-               eta = 1, nthread = 2, nrounds = 10,objective = "binary:logistic")
+               eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")

 ```


@@ -219,7 +221,7 @@ For that purpose we will execute the same function as above but using two more p
 importanceRaw <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst, data = sparse_matrix, label = output_vector)

 # Cleaning for better display
-importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
+importanceClean <- importanceRaw[, `:=`(Cover = NULL, Frequency = NULL)]

 head(importanceClean)
 ```

@@ -321,8 +323,8 @@ If you want to try Random Forests algorithm, you can tweak XGBoost parameters!
 For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns:

 ```{r, warning=FALSE, message=FALSE}
-data(agaricus.train, package='xgboost')
-data(agaricus.test, package='xgboost')
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
 train <- agaricus.train
 test <- agaricus.test


@@ -52,9 +52,9 @@ It has several features:
 For weekly updated version (highly recommended), install from *GitHub*:

 ```{r installGithub, eval=FALSE}
-install.packages("drat", repos="https://cran.rstudio.com")
+install.packages("drat", repos = "https://cran.rstudio.com")
 drat:::addRepo("dmlc")
-install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source")
+install.packages("xgboost", repos = "http://dmlc.ml/drat/", type = "source")
 ```

 > *Windows* user will need to install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) first.

@@ -101,8 +101,8 @@ Why *split* the dataset in two parts?
 In the first part we will build our model. In the second part we will want to test it and assess its quality. Without dividing the dataset we would test the model on the data which the algorithm have already seen.

 ```{r datasetLoading, results='hold', message=F, warning=F}
-data(agaricus.train, package='xgboost')
-data(agaricus.test, package='xgboost')
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
 train <- agaricus.train
 test <- agaricus.test
 ```

@@ -314,8 +314,8 @@ Most of the features below have been implemented to help you to improve your mod
 For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.

 ```{r DMatrix, message=F, warning=F}
-dtrain <- xgb.DMatrix(data = train$data, label=train$label)
-dtest <- xgb.DMatrix(data = test$data, label=test$label)
+dtrain <- xgb.DMatrix(data = train$data, label = train$label)
+dtest <- xgb.DMatrix(data = test$data, label = test$label)
 ```

 ### Measure learning progress with xgb.train

@@ -332,7 +332,7 @@ One way to measure progress in learning of a model is to provide to **XGBoost**
 For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name.

 ```{r watchlist, message=F, warning=F}
-watchlist <- list(train=dtrain, test=dtest)
+watchlist <- list(train = dtrain, test = dtest)

 bst <- xgb.train(
   data = dtrain

@@ -425,7 +425,7 @@ Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereaf
 ```{r getinfo, message=F, warning=F}
 label <- getinfo(dtest, "label")
 pred <- predict(bst, dtest)
-err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
+err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
 print(paste("test-error=", err))
 ```

@@ -479,7 +479,7 @@ bst2 <- xgb.load("xgboost.model")
 pred2 <- predict(bst2, test$data)

 # And now the test
-print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
+print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))
 ```

 ```{r clean, include=FALSE}

@@ -503,7 +503,7 @@ bst3 <- xgb.load(rawVec)
 pred3 <- predict(bst3, test$data)

 # pred2 should be identical to pred
-print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
+print(paste("sum(abs(pred3-pred))=", sum(abs(pred2 - pred))))
 ```

 > Again `0`? It seems that `XGBoost` works pretty well!

@@ -30,7 +30,7 @@ For the purpose of this tutorial we will load the xgboost, jsonlite, and float p
 require(xgboost)
 require(jsonlite)
 require(float)
-options(digits=22)
+options(digits = 22)
 ```

 We will create a toy binary logistic model based on the example first provided [here](https://github.com/dmlc/xgboost/issues/3960), so that we can easily understand the structure of the dumped JSON model object. This will allow us to understand where discrepancies can occur and how they should be handled.

@@ -50,10 +50,10 @@ labels <- c(1, 1, 1,
             0, 0, 0,
             0, 0, 0)

-data <- data.frame(dates = dates, labels=labels)
+data <- data.frame(dates = dates, labels = labels)

 bst <- xgboost(
-  data = as.matrix(data$dates),
+  data = as.matrix(data$dates),
   label = labels,
   nthread = 2,
   nrounds = 1,

@@ -69,7 +69,7 @@ We will now dump the model to JSON and attempt to illustrate a variety of issues
 First let's dump the model to JSON:

 ```{r}
-bst_json <- xgb.dump(bst, with_stats = FALSE, dump_format='json')
+bst_json <- xgb.dump(bst, with_stats = FALSE, dump_format = 'json')
 bst_from_json <- fromJSON(bst_json, simplifyDataFrame = FALSE)
 node <- bst_from_json[[1]]
 cat(bst_json)

@@ -78,10 +78,10 @@ cat(bst_json)
 The tree JSON shown by the above code-chunk tells us that if the data is less than 20180132, the tree will output the value in the first leaf. Otherwise it will output the value in the second leaf. Let's try to reproduce this manually with the data we have and confirm that it matches the model predictions we've already calculated.

 ```{r}
-bst_preds_logodds <- predict(bst,as.matrix(data$dates), outputmargin = TRUE)
+bst_preds_logodds <- predict(bst, as.matrix(data$dates), outputmargin = TRUE)

 # calculate the logodds values using the JSON representation
-bst_from_json_logodds <- ifelse(data$dates<node$split_condition,
+bst_from_json_logodds <- ifelse(data$dates < node$split_condition,
                                 node$children[[1]]$leaf,
                                 node$children[[2]]$leaf)


@@ -106,19 +106,19 @@ At this stage two things happened:
 To explain this, let's repeat the comparison and round to two decimals:

 ```{r}
-round(bst_preds_logodds,2) == round(bst_from_json_logodds,2)
+round(bst_preds_logodds, 2) == round(bst_from_json_logodds, 2)
 ```

 If we round to two decimals, we see that only the elements related to data values of `20180131` don't agree. If we convert the data to floats, they agree:

 ```{r}
 # now convert the dates to floats first
-bst_from_json_logodds <- ifelse(fl(data$dates)<node$split_condition,
+bst_from_json_logodds <- ifelse(fl(data$dates) < node$split_condition,
                                 node$children[[1]]$leaf,
                                 node$children[[2]]$leaf)

 # test that values are equal
-round(bst_preds_logodds,2) == round(bst_from_json_logodds,2)
+round(bst_preds_logodds, 2) == round(bst_from_json_logodds, 2)
 ```

 What's the lesson? If we are going to work with an imported JSON model, any data must be converted to floats first. In this case, since '20180131' cannot be represented as a 32-bit float, it is rounded up to 20180132, as shown here:

@@ -143,7 +143,7 @@ None are exactly equal. What happened? Although we've converted the data to 32

 ```{r}
 # now convert the dates to floats first
-bst_from_json_logodds <- ifelse(fl(data$dates)<fl(node$split_condition),
+bst_from_json_logodds <- ifelse(fl(data$dates) < fl(node$split_condition),
                                 as.numeric(fl(node$children[[1]]$leaf)),
                                 as.numeric(fl(node$children[[2]]$leaf)))


@@ -160,12 +160,13 @@ We were able to get the log-odds to agree, so now let's manually calculate the s


 ```{r}
-bst_preds <- predict(bst,as.matrix(data$dates))
+bst_preds <- predict(bst, as.matrix(data$dates))

 # calculate the predictions casting doubles to floats
-bst_from_json_preds <- ifelse(fl(data$dates)<fl(node$split_condition),
-                              as.numeric(1/(1+exp(-1*fl(node$children[[1]]$leaf)))),
-                              as.numeric(1/(1+exp(-1*fl(node$children[[2]]$leaf))))
+bst_from_json_preds <- ifelse(
+  fl(data$dates) < fl(node$split_condition)
+  , as.numeric(1 / (1 + exp(-1 * fl(node$children[[1]]$leaf))))
+  , as.numeric(1 / (1 + exp(-1 * fl(node$children[[2]]$leaf))))
 )

 # test that values are equal

@@ -177,9 +178,10 @@ None are exactly equal again. What is going on here? Well, since we are using
 How do we fix this? We have to ensure we use the correct data types everywhere and the correct operators. If we use only floats, the float library that we have loaded will ensure the 32-bit float exponentiation operator is applied.
 ```{r}
 # calculate the predictions casting doubles to floats
-bst_from_json_preds <- ifelse(fl(data$dates)<fl(node$split_condition),
-                              as.numeric(fl(1)/(fl(1)+exp(fl(-1)*fl(node$children[[1]]$leaf)))),
-                              as.numeric(fl(1)/(fl(1)+exp(fl(-1)*fl(node$children[[2]]$leaf))))
+bst_from_json_preds <- ifelse(
+  fl(data$dates) < fl(node$split_condition)
+  , as.numeric(fl(1) / (fl(1) + exp(fl(-1) * fl(node$children[[1]]$leaf))))
+  , as.numeric(fl(1) / (fl(1) + exp(fl(-1) * fl(node$children[[2]]$leaf))))
 )

 # test that values are equal

@@ -1,8 +1,10 @@
 site <- 'http://cran.r-project.org'
-if (!require('dummies'))
-  install.packages('dummies', repos=site)
-if (!require('insuranceData'))
-  install.packages('insuranceData', repos=site)
+if (!require('dummies')) {
+  install.packages('dummies', repos = site)
+}
+if (!require('insuranceData')) {
+  install.packages('insuranceData', repos = site)
+}

 library(dummies)
 library(insuranceData)

@@ -14,5 +16,16 @@ data$STATE <- as.factor(data$STATE)
 data$CLASS <- as.factor(data$CLASS)
 data$GENDER <- as.factor(data$GENDER)

-data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=TRUE);
-write.table(data.dummy, 'autoclaims.csv', sep=',', row.names=F, col.names=F, quote=F)
+data.dummy <- dummy.data.frame(
+  data
+  , dummy.class = 'factor'
+  , omit.constants = TRUE
+)
+write.table(
+  data.dummy
+  , 'autoclaims.csv'
+  , sep = ','
+  , row.names = FALSE
+  , col.names = FALSE
+  , quote = FALSE
+)

@@ -4,21 +4,21 @@ require(methods)

 modelfile <- "higgs.model"
 outfile <- "higgs.pred.csv"
-dtest <- read.csv("data/test.csv", header=TRUE)
+dtest <- read.csv("data/test.csv", header = TRUE)
 data <- as.matrix(dtest[2:31])
 idx <- dtest[[1]]

 xgmat <- xgb.DMatrix(data, missing = -999.0)
-bst <- xgb.load(modelfile=modelfile)
+bst <- xgb.load(modelfile = modelfile)
 ypred <- predict(bst, xgmat)

-rorder <- rank(ypred, ties.method="first")
+rorder <- rank(ypred, ties.method = "first")

 threshold <- 0.15
 # to be completed
-ntop <- length(rorder) - as.integer(threshold*length(rorder))
+ntop <- length(rorder) - as.integer(threshold * length(rorder))
 plabel <- ifelse(rorder > ntop, "s", "b")
 outdata <- list("EventId" = idx,
                 "RankOrder" = rorder,
                 "Class" = plabel)
-write.csv(outdata, file = outfile, quote=FALSE, row.names=FALSE)
+write.csv(outdata, file = outfile, quote = FALSE, row.names = FALSE)

@@ -4,14 +4,14 @@ require(methods)

 testsize <- 550000

-dtrain <- read.csv("data/training.csv", header=TRUE)
+dtrain <- read.csv("data/training.csv", header = TRUE)
 dtrain[33] <- dtrain[33] == "s"
 label <- as.numeric(dtrain[[33]])
 data <- as.matrix(dtrain[2:31])
 weight <- as.numeric(dtrain[[32]]) * testsize / length(label)

-sumwpos <- sum(weight * (label==1.0))
-sumwneg <- sum(weight * (label==0.0))
+sumwpos <- sum(weight * (label == 1.0))
+sumwneg <- sum(weight * (label == 0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))

 xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)

@@ -25,7 +25,7 @@ param <- list("objective" = "binary:logitraw",
 watchlist <- list("train" = xgmat)
 nrounds <- 120
 print ("loading data end, start to boost trees")
-bst <- xgb.train(param, xgmat, nrounds, watchlist );
+bst <- xgb.train(param, xgmat, nrounds, watchlist)
 # save out model
 xgb.save(bst, "higgs.model")
 print ('finish training')

@@ -5,10 +5,10 @@ require(methods)

 testsize <- 550000

-dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)
-dtrain$Label <- as.numeric(dtrain$Label=='s')
+dtrain <- read.csv("data/training.csv", header = TRUE, nrows = 350001)
+dtrain$Label <- as.numeric(dtrain$Label == 's')
 # gbm.time = system.time({
-# gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
+# gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
 # interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,
 # verbose = TRUE)
 # })

@@ -20,12 +20,12 @@ dtrain$Label <- as.numeric(dtrain$Label=='s')
 data <- as.matrix(dtrain[2:31])
 weight <- as.numeric(dtrain[[32]]) * testsize / length(label)

-sumwpos <- sum(weight * (label==1.0))
-sumwneg <- sum(weight * (label==0.0))
+sumwpos <- sum(weight * (label == 1.0))
+sumwneg <- sum(weight * (label == 0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))

 xgboost.time <- list()
-threads <- c(1,2,4,8,16)
+threads <- c(1, 2, 4, 8, 16)
 for (i in 1:length(threads)){
   thread <- threads[i]
   xgboost.time[[i]] <- system.time({

@@ -40,7 +40,7 @@ for (i in 1:length(threads)){
     watchlist <- list("train" = xgmat)
     nrounds <- 120
     print ("loading data end, start to boost trees")
-    bst <- xgb.train(param, xgmat, nrounds, watchlist );
+    bst <- xgb.train(param, xgmat, nrounds, watchlist)
     # save out model
     xgb.save(bst, "higgs.model")
     print ('finish training')

@@ -49,22 +49,21 @@ for (i in 1:length(threads)){

 xgboost.time
 # [[1]]
-# user system elapsed
-# 99.015 0.051 98.982
-#
+# user system elapsed
+# 99.015 0.051 98.982
+#
 # [[2]]
-# user system elapsed
-# 100.268 0.317 55.473
-#
+# user system elapsed
+# 100.268 0.317 55.473
+#
 # [[3]]
-# user system elapsed
-# 111.682 0.777 35.963
-#
+# user system elapsed
+# 111.682 0.777 35.963
+#
 # [[4]]
-# user system elapsed
-# 149.396 1.851 32.661
-#
+# user system elapsed
+# 149.396 1.851 32.661
+#
 # [[5]]
-# user system elapsed
-# 157.390 5.988 40.949
-
+# user system elapsed
+# 157.390 5.988 40.949

@@ -1,20 +1,20 @@
 require(xgboost)
 require(methods)

-train <- read.csv('data/train.csv',header=TRUE,stringsAsFactors = FALSE)
-test <- read.csv('data/test.csv',header=TRUE,stringsAsFactors = FALSE)
-train <- train[,-1]
-test <- test[,-1]
+train <- read.csv('data/train.csv', header = TRUE, stringsAsFactors = FALSE)
+test <- read.csv('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
+train <- train[, -1]
+test <- test[, -1]

-y <- train[,ncol(train)]
-y <- gsub('Class_','',y)
-y <- as.integer(y)-1 # xgboost take features in [0,numOfClass)
+y <- train[, ncol(train)]
+y <- gsub('Class_', '', y)
+y <- as.integer(y) - 1 # xgboost take features in [0,numOfClass)

-x <- rbind(train[,-ncol(train)],test)
+x <- rbind(train[, -ncol(train)], test)
 x <- as.matrix(x)
-x <- matrix(as.numeric(x),nrow(x),ncol(x))
+x <- matrix(as.numeric(x), nrow(x), ncol(x))
 trind <- 1:length(y)
-teind <- (nrow(train)+1):nrow(x)
+teind <- (nrow(train) + 1):nrow(x)

 # Set necessary parameter
 param <- list("objective" = "multi:softprob",

@@ -24,20 +24,25 @@ param <- list("objective" = "multi:softprob",

 # Run Cross Validation
 cv.nrounds <- 50
-bst.cv <- xgb.cv(param=param, data = x[trind,], label = y,
-                 nfold = 3, nrounds=cv.nrounds)
+bst.cv <- xgb.cv(
+  param = param
+  , data = x[trind, ]
+  , label = y
+  , nfold = 3
+  , nrounds = cv.nrounds
+)

 # Train the model
 nrounds <- 50
-bst <- xgboost(param=param, data = x[trind,], label = y, nrounds=nrounds)
+bst <- xgboost(param = param, data = x[trind, ], label = y, nrounds = nrounds)

 # Make prediction
-pred <- predict(bst,x[teind,])
-pred <- matrix(pred,9,length(pred)/9)
+pred <- predict(bst, x[teind, ])
+pred <- matrix(pred, 9, length(pred) / 9)
 pred <- t(pred)

 # Output submission
-pred <- format(pred, digits=2,scientific=F) # shrink the size of submission
-pred <- data.frame(1:nrow(pred),pred)
-names(pred) <- c('id', paste0('Class_',1:9))
-write.csv(pred,file='submission.csv', quote=FALSE,row.names=FALSE)
+pred <- format(pred, digits = 2, scientific = FALSE) # shrink the size of submission
+pred <- data.frame(1:nrow(pred), pred)
+names(pred) <- c('id', paste0('Class_', 1:9))
+write.csv(pred, file = 'submission.csv', quote = FALSE, row.names = FALSE)

@@ -31,7 +31,7 @@ require(methods)
 require(data.table)
 require(magrittr)
 train <- fread('data/train.csv', header = T, stringsAsFactors = FALSE)
-test <- fread('data/test.csv', header=TRUE, stringsAsFactors = FALSE)
+test <- fread('data/test.csv', header = TRUE, stringsAsFactors = FALSE)
 ```
 > `magrittr` and `data.table` are here to make the code cleaner and much more rapid.


@@ -42,13 +42,13 @@ Let's explore the dataset.
 dim(train)

 # Training content
-train[1:6,1:5, with =FALSE]
+train[1:6, 1:5, with = FALSE]

 # Test dataset dimensions
 dim(test)

 # Test content
-test[1:6,1:5, with =FALSE]
+test[1:6, 1:5, with = FALSE]
 ```
 > We only display the 6 first rows and 5 first columns for convenience


@@ -87,7 +87,7 @@ For that purpose, we will:
 ```{r classToIntegers}
 # Convert from classes to numbers
 y <- train[, nameLastCol, with = FALSE][[1]] %>%
-  gsub('Class_','',.) %>%
+  gsub('Class_', '', .) %>%
   as.integer %>%
   subtract(., 1)


@@ -98,14 +98,14 @@ y[1:5]
 We remove label column from training dataset, otherwise **XGBoost** would use it to guess the labels!

 ```{r deleteCols, results='hide'}
-train[, nameLastCol:=NULL, with = FALSE]
+train[, nameLastCol := NULL, with = FALSE]
 ```

 `data.table` is an awesome implementation of data.frame, unfortunately it is not a format supported natively by **XGBoost**. We need to convert both datasets (training and test) in `numeric` Matrix format.

 ```{r convertToNumericMatrix}
-trainMatrix <- train[,lapply(.SD,as.numeric)] %>% as.matrix
-testMatrix <- test[,lapply(.SD,as.numeric)] %>% as.matrix
+trainMatrix <- train[, lapply(.SD, as.numeric)] %>% as.matrix
+testMatrix <- test[, lapply(.SD, as.numeric)] %>% as.matrix
 ```

 Model training

@@ -127,7 +127,7 @@ param <- list("objective" = "multi:softprob",
 cv.nrounds <- 5
 cv.nfold <- 3

-bst.cv <- xgb.cv(param=param, data = trainMatrix, label = y,
+bst.cv <- xgb.cv(param = param, data = trainMatrix, label = y,
                  nfold = cv.nfold, nrounds = cv.nrounds)
 ```
 > As we can see the error rate is low on the test dataset (for a 5mn trained model).

@@ -136,7 +136,7 @@ Finally, we are ready to train the real model!!!

 ```{r modelTraining}
 nrounds <- 50
-bst <- xgboost(param=param, data = trainMatrix, label = y, nrounds=nrounds)
+bst <- xgboost(param = param, data = trainMatrix, label = y, nrounds = nrounds)
 ```

 Model understanding

@@ -189,7 +189,7 @@ names <- dimnames(trainMatrix)[[2]]
 importance_matrix <- xgb.importance(names, model = bst)

 # Nice graph
-xgb.plot.importance(importance_matrix[1:10,])
+xgb.plot.importance(importance_matrix[1:10, ])
 ```

 > To make it understandable we first extract the column names from the `Matrix`.


@@ -16,8 +16,8 @@ df[, `:=`(V34 = as.integer(ifelse(V34 == "?", 0L, V34)),

 idx <- sample(nrow(df), size = round(0.7 * nrow(df)), replace = FALSE)

-train <- df[idx,]
-test <- df[-idx,]
+train <- df[idx, ]
+test <- df[-idx, ]

 train_x <- train[, 1:34]
 train_y <- train[, V35]