diff --git a/demo/kaggle-higgs/README.md b/demo/kaggle-higgs/README.md
index 9e535ef1e..c04b65389 100644
--- a/demo/kaggle-higgs/README.md
+++ b/demo/kaggle-higgs/README.md
@@ -17,3 +17,9 @@ make
 Speed
 =====
 speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
+
+
+Using the R module
+=====
+* Alternatively, you can run the demo with R: higgs-train.R trains the model and higgs-pred.R writes the submission file
+
diff --git a/demo/kaggle-higgs/higgs-pred.R b/demo/kaggle-higgs/higgs-pred.R
new file mode 100644
index 000000000..a236c2adf
--- /dev/null
+++ b/demo/kaggle-higgs/higgs-pred.R
@@ -0,0 +1,23 @@
+# include the xgboost library; chdir=TRUE must be set so its relative paths resolve
+source("../../wrapper/xgboost.R", chdir=TRUE)
+
+modelfile <- "higgs.model"
+outfile <- "higgs.pred.csv"
+dtest <- read.csv("data/test.csv", header=TRUE)
+data <- as.matrix(dtest[2:31])
+idx <- dtest[[1]]
+
+xgmat <- xgb.DMatrix(data, missing = -999.0)
+bst <- xgb.Booster(params=list("nthread"=16), modelfile=modelfile)
+ypred <- xgb.predict(bst, xgmat)
+
+rorder <- rank(ypred, ties.method="first")
+
+threshold <- 0.15
+# label the top 15% of events by rank as signal ("s"), the rest as background ("b")
+ntop <- length(rorder) - as.integer(threshold * length(rorder))
+plabel <- ifelse(rorder > ntop, "s", "b")
+outdata <- list("EventId" = idx,
+                "RankOrder" = rorder,
+                "Class" = plabel)
+write.csv(outdata, file = outfile, quote=FALSE, row.names=FALSE)
diff --git a/demo/kaggle-higgs/higgs-pred.py b/demo/kaggle-higgs/higgs-pred.py
index 15dd293dd..e5383f89d 100755
--- a/demo/kaggle-higgs/higgs-pred.py
+++ b/demo/kaggle-higgs/higgs-pred.py
@@ -21,8 +21,7 @@ idx = dtest[:,0]
 print ('finish loading from csv ')
 
 xgmat = xgb.DMatrix( data, missing = -999.0 )
-bst = xgb.Booster({'nthread':16})
-bst.load_model( modelfile )
+bst = xgb.Booster({'nthread':16}, model_file = modelfile)
 ypred = bst.predict( xgmat )
 
 res = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]
diff --git a/demo/kaggle-higgs/higgs-train.R b/demo/kaggle-higgs/higgs-train.R
new file mode 100644
index 000000000..f5a45fe1f
--- /dev/null
+++ b/demo/kaggle-higgs/higgs-train.R
@@ -0,0 +1,31 @@
+# include the xgboost library; chdir=TRUE must be set so its relative paths resolve
+source("../../wrapper/xgboost.R", chdir=TRUE)
+testsize <- 550000
+
+dtrain <- read.csv("data/training.csv", header=TRUE)
+dtrain[33] <- dtrain[33] == "s"
+label <- as.numeric(dtrain[[33]])
+data <- as.matrix(dtrain[2:31])
+weight <- as.numeric(dtrain[[32]]) * testsize / length(label)
+
+sumwpos <- sum(weight * (label == 1.0))
+sumwneg <- sum(weight * (label == 0.0))
+print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
+
+xgmat <- xgb.DMatrix(data, info = list(label=label, weight=weight), missing = -999.0)
+param <- list("objective" = "binary:logitraw",
+              "scale_pos_weight" = sumwneg / sumwpos,
+              "bst:eta" = 0.1,
+              "bst:max_depth" = 6,
+              "eval_metric" = "auc",
+              "eval_metric" = "ams@0.15",
+              "silent" = 1,
+              "nthread" = 16)
+watchlist <- list("train" = xgmat)
+nround <- 120
+print("loading data end, start to boost trees")
+bst <- xgb.train(param, xgmat, nround, watchlist)
+# save out the model
+xgb.save(bst, "higgs.model")
+print("finish training")
+
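
For reference, a minimal sketch of how the new R scripts would be invoked; this is an assumption based on the paths used in the diff, not part of the change itself. It assumes the working directory is demo/kaggle-higgs, the Kaggle CSVs have been downloaded into data/, and the xgboost R wrapper under wrapper/ has already been built:

    # assumptions: cwd is demo/kaggle-higgs, data/training.csv and data/test.csv exist,
    # and ../../wrapper/xgboost.R is available (wrapper built beforehand)
    source("higgs-train.R")  # trains the booster and writes higgs.model
    source("higgs-pred.R")   # loads higgs.model and writes the higgs.pred.csv submission

higgs-pred.R is run after higgs-train.R, since it reads the higgs.model file that the training script saves.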