71 lines
1.9 KiB
R
71 lines
1.9 KiB
R
# install xgboost package, see R-package in root folder
|
|
require(xgboost)
|
|
require(gbm)
|
|
require(methods)
|
|
|
|
# Scale factor for the event weights: each weight below is multiplied by
# testsize / (number of training rows read).
# NOTE(review): presumably the number of events in the challenge test set --
# confirm against the data description.
testsize <- 550000

# Read at most 350001 data rows of the training file (header line excluded).
dtrain <- read.csv("data/training.csv", header = TRUE, nrows = 350001)

# Recode the class label in place: "s" becomes 1, every other value becomes 0.
dtrain$Label <- as.numeric(dtrain$Label == "s")

# Reference gbm timing on the same data (disabled).
# gbm.time = system.time({
#   gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
#                    interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,
#                    verbose = TRUE)
# })
# print(gbm.time)
# Test result: 761.48 secs

# Numeric 0/1 response vector. It has to be assigned here: the weight
# rescaling and the weight statistics below both read `label`, and without
# this line the script stops with "object 'label' not found".
label <- dtrain$Label

# Columns 2..31 form the feature matrix; column 32 supplies the per-row weight.
data <- as.matrix(dtrain[2:31])
weight <- as.numeric(dtrain[[32]]) * testsize / length(label)

# Total rescaled weight of the positive (label 1) and negative (label 0) rows.
# Their ratio is later passed to xgboost as scale_pos_weight.
sumwpos <- sum(weight * (label == 1.0))
sumwneg <- sum(weight * (label == 0.0))
print(paste(
  "weight statistics: wpos=", sumwpos,
  "wneg=", sumwneg,
  "ratio=", sumwneg / sumwpos
))
|
|
|
|
# Benchmark xgboost training once per thread count. xgboost.time[[i]] holds the
# system.time() result for threads[i]. Note that the timed region covers
# DMatrix construction, training and writing the model file, not training alone.
threads <- c(1, 2, 4, 8, 16)
xgboost.time <- vector("list", length(threads))
for (i in seq_along(threads)) {
  thread <- threads[i]
  xgboost.time[[i]] <- system.time({
    # -999.0 is passed as the missing-value marker for the feature matrix.
    xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
    # "eval_metric" is listed twice on purpose: one entry per metric.
    # NOTE(review): presumably both metrics are then reported for the
    # watchlist -- confirm against the xgboost version in use.
    param <- list("objective" = "binary:logitraw",
                  "scale_pos_weight" = sumwneg / sumwpos,
                  "bst:eta" = 0.1,
                  "bst:max_depth" = 6,
                  "eval_metric" = "auc",
                  "eval_metric" = "ams@0.15",
                  "nthread" = thread)
    # Evaluate on the training matrix itself; no held-out set is used here.
    watchlist <- list("train" = xgmat)
    nrounds <- 120
    print("loading data end, start to boost trees")
    bst <- xgb.train(param, xgmat, nrounds, watchlist)
    # save out model (the same file is overwritten on every iteration)
    xgb.save(bst, "higgs.model")
    print("finish training")
  })
}

# Print the collected timings, one entry per thread count.
xgboost.time
|
|
# Recorded output of a previous run (one entry per value in `threads`):
#
# [[1]]
#    user  system elapsed
#  99.015   0.051  98.982
#
# [[2]]
#    user  system elapsed
# 100.268   0.317  55.473
#
# [[3]]
#    user  system elapsed
# 111.682   0.777  35.963
#
# [[4]]
#    user  system elapsed
# 149.396   1.851  32.661
#
# [[5]]
#    user  system elapsed
# 157.390   5.988  40.949
|
|
|