commit
cf274e76f4
@ -17,3 +17,9 @@ make
|
|||||||
Speed
|
Speed
|
||||||
=====
|
=====
|
||||||
speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
|
speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
|
||||||
|
|
||||||
|
|
||||||
|
Using R module
|
||||||
|
=====
|
||||||
|
* Alternatively, you can run the demo in R using higgs-train.R and higgs-pred.R
|
||||||
|
|
||||||
|
|||||||
23
demo/kaggle-higgs/higgs-pred.R
Normal file
23
demo/kaggle-higgs/higgs-pred.R
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
# include xgboost library, must set chdir=TRUE
|
||||||
|
source("../../wrapper/xgboost.R", chdir=TRUE)
|
||||||
|
|
||||||
|
modelfile <- "higgs.model"
|
||||||
|
outfile <- "higgs.pred.csv"
|
||||||
|
dtest <- read.csv("data/test.csv", header=TRUE)
|
||||||
|
data <- as.matrix(dtest[2:31])
|
||||||
|
idx <- dtest[[1]]
|
||||||
|
|
||||||
|
xgmat <- xgb.DMatrix(data, missing = -999.0)
|
||||||
|
bst <- xgb.Booster(params=list("nthread"=16), modelfile=modelfile)
|
||||||
|
ypred <- xgb.predict(bst, xgmat)
|
||||||
|
|
||||||
|
rorder <- rank(ypred, ties.method="first")
|
||||||
|
|
||||||
|
threshold <- 0.15
|
||||||
|
# to be completed
|
||||||
|
ntop <- length(rorder) - as.integer(threshold*length(rorder))
|
||||||
|
plabel <- ifelse(rorder > ntop, "s", "b")
|
||||||
|
outdata <- list("EventId" = idx,
|
||||||
|
"RankOrder" = rorder,
|
||||||
|
"Class" = plabel)
|
||||||
|
write.csv(outdata, file = outfile, quote=FALSE, row.names=FALSE)
|
||||||
@ -21,8 +21,7 @@ idx = dtest[:,0]
|
|||||||
|
|
||||||
print ('finish loading from csv ')
|
print ('finish loading from csv ')
|
||||||
xgmat = xgb.DMatrix( data, missing = -999.0 )
|
xgmat = xgb.DMatrix( data, missing = -999.0 )
|
||||||
bst = xgb.Booster({'nthread':16})
|
bst = xgb.Booster({'nthread':16}, model_file = modelfile)
|
||||||
bst.load_model( modelfile )
|
|
||||||
ypred = bst.predict( xgmat )
|
ypred = bst.predict( xgmat )
|
||||||
|
|
||||||
res = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]
|
res = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]
|
||||||
|
|||||||
31
demo/kaggle-higgs/higgs-train.R
Normal file
31
demo/kaggle-higgs/higgs-train.R
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# include xgboost library, must set chdir=TRUE
|
||||||
|
source("../../wrapper/xgboost.R", chdir=TRUE)
|
||||||
|
testsize <- 550000
|
||||||
|
|
||||||
|
dtrain <- read.csv("data/training.csv", header=TRUE)
|
||||||
|
dtrain[33] <- dtrain[33] == "s"
|
||||||
|
label <- as.numeric(dtrain[[33]])
|
||||||
|
data <- as.matrix(dtrain[2:31])
|
||||||
|
weight <- as.numeric(dtrain[[32]]) * testsize / length(label)
|
||||||
|
|
||||||
|
sumwpos <- sum(weight * (label==1.0))
|
||||||
|
sumwneg <- sum(weight * (label==0.0))
|
||||||
|
print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
|
||||||
|
|
||||||
|
xgmat <- xgb.DMatrix(data, info = list(label=label, weight=weight), missing = -999.0)
|
||||||
|
param <- list("objective" = "binary:logitraw",
|
||||||
|
"scale_pos_weight" = sumwneg / sumwpos,
|
||||||
|
"bst:eta" = 0.1,
|
||||||
|
"bst:max_depth" = 6,
|
||||||
|
"eval_metric" = "auc",
|
||||||
|
"eval_metric" = "ams@0.15",
|
||||||
|
"silent" = 1,
|
||||||
|
"nthread" = 16)
|
||||||
|
watchlist <- list("train" = xgmat)
|
||||||
|
nround = 120
|
||||||
|
print ("loading data end, start to boost trees")
|
||||||
|
bst = xgb.train(param, xgmat, nround, watchlist );
|
||||||
|
# save out model
|
||||||
|
xgb.save(bst, "higgs.model")
|
||||||
|
print ('finish training')
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user