From ed9d8a1c0e42e360fca9fb9dd7c322e42492e0b6 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 23 Aug 2014 20:52:56 -0700 Subject: [PATCH 1/3] add higgs example --- demo/kaggle-higgs/README.md | 6 ++++++ demo/kaggle-higgs/higgs-numpy.py | 2 +- demo/kaggle-higgs/higgs-pred.R | 23 +++++++++++++++++++++++ demo/kaggle-higgs/higgs-pred.py | 3 +-- demo/kaggle-higgs/higgs-train.R | 31 +++++++++++++++++++++++++++++++ 5 files changed, 62 insertions(+), 3 deletions(-) create mode 100644 demo/kaggle-higgs/higgs-pred.R create mode 100644 demo/kaggle-higgs/higgs-train.R diff --git a/demo/kaggle-higgs/README.md b/demo/kaggle-higgs/README.md index 9e535ef1e..c04b65389 100644 --- a/demo/kaggle-higgs/README.md +++ b/demo/kaggle-higgs/README.md @@ -17,3 +17,9 @@ make Speed ===== speedtest.py compares xgboost's speed on this dataset with sklearn.GBM + + +Using R module +===== +* Alternatively, you can run using R, higgs-train.R and higgs-pred.R + diff --git a/demo/kaggle-higgs/higgs-numpy.py b/demo/kaggle-higgs/higgs-numpy.py index bd60f074f..87d323914 100755 --- a/demo/kaggle-higgs/higgs-numpy.py +++ b/demo/kaggle-higgs/higgs-numpy.py @@ -53,7 +53,7 @@ plst = list(param.items())+[('eval_metric', 'ams@0.15')] watchlist = [ (xgmat,'train') ] # boost 120 tres -num_round = 120 +num_round = 2 print ('loading data end, start to boost trees') bst = xgb.train( plst, xgmat, num_round, watchlist ); # save out model diff --git a/demo/kaggle-higgs/higgs-pred.R b/demo/kaggle-higgs/higgs-pred.R new file mode 100644 index 000000000..a236c2adf --- /dev/null +++ b/demo/kaggle-higgs/higgs-pred.R @@ -0,0 +1,23 @@ +# include xgboost library, must set chdir=TRUE +source("../../wrapper/xgboost.R", chdir=TRUE) + +modelfile <- "higgs.model" +outfile <- "higgs.pred.csv" +dtest <- read.csv("data/test.csv", header=TRUE) +data <- as.matrix(dtest[2:31]) +idx <- dtest[[1]] + +xgmat <- xgb.DMatrix(data, missing = -999.0) +bst <- xgb.Booster(params=list("nthread"=16), modelfile=modelfile) +ypred <- 
xgb.predict(bst, xgmat) + +rorder <- rank(ypred, ties.method="first") + +threshold <- 0.15 +# label the top `threshold` fraction (15%) of ranked events as signal +ntop <- length(rorder) - as.integer(threshold*length(rorder)) +plabel <- ifelse(rorder > ntop, "s", "b") +outdata <- list("EventId" = idx, + "RankOrder" = rorder, + "Class" = plabel) +write.csv(outdata, file = outfile, quote=FALSE, row.names=FALSE) diff --git a/demo/kaggle-higgs/higgs-pred.py b/demo/kaggle-higgs/higgs-pred.py index 15dd293dd..e5383f89d 100755 --- a/demo/kaggle-higgs/higgs-pred.py +++ b/demo/kaggle-higgs/higgs-pred.py @@ -21,8 +21,7 @@ idx = dtest[:,0] print ('finish loading from csv ') xgmat = xgb.DMatrix( data, missing = -999.0 ) -bst = xgb.Booster({'nthread':16}) -bst.load_model( modelfile ) +bst = xgb.Booster({'nthread':16}, model_file = modelfile) ypred = bst.predict( xgmat ) res = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ] diff --git a/demo/kaggle-higgs/higgs-train.R b/demo/kaggle-higgs/higgs-train.R new file mode 100644 index 000000000..f5a45fe1f --- /dev/null +++ b/demo/kaggle-higgs/higgs-train.R @@ -0,0 +1,31 @@ +# include xgboost library, must set chdir=TRUE +source("../../wrapper/xgboost.R", chdir=TRUE) +testsize <- 550000 + +dtrain <- read.csv("data/training.csv", header=TRUE) +dtrain[33] <- dtrain[33] == "s" +label <- as.numeric(dtrain[[33]]) +data <- as.matrix(dtrain[2:31]) +weight <- as.numeric(dtrain[[32]]) * testsize / length(label) + +sumwpos <- sum(weight * (label==1.0)) +sumwneg <- sum(weight * (label==0.0)) +print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos)) + +xgmat <- xgb.DMatrix(data, info = list(label=label, weight=weight), missing = -999.0) +param <- list("objective" = "binary:logitraw", + "scale_pos_weight" = sumwneg / sumwpos, + "bst:eta" = 0.1, + "bst:max_depth" = 6, + "eval_metric" = "auc", + "eval_metric" = "ams@0.15", + "silent" = 1, + "nthread" = 16) +watchlist <- list("train" = xgmat) +nround = 120 +print ("loading data end, start to boost trees") 
+bst = xgb.train(param, xgmat, nround, watchlist ); +# save out model +xgb.save(bst, "higgs.model") +print ('finish training') + From d16a56814b862144fde616707bcc1f156ba84155 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 23 Aug 2014 20:53:16 -0700 Subject: [PATCH 2/3] remove pred.csv From fea7245fa03f8500b0198108c657f34bba1819f6 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 23 Aug 2014 20:53:56 -0700 Subject: [PATCH 3/3] chg python back --- demo/kaggle-higgs/higgs-numpy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/kaggle-higgs/higgs-numpy.py b/demo/kaggle-higgs/higgs-numpy.py index 87d323914..bd60f074f 100755 --- a/demo/kaggle-higgs/higgs-numpy.py +++ b/demo/kaggle-higgs/higgs-numpy.py @@ -53,7 +53,7 @@ plst = list(param.items())+[('eval_metric', 'ams@0.15')] watchlist = [ (xgmat,'train') ] # boost 120 tres -num_round = 2 +num_round = 120 print ('loading data end, start to boost trees') bst = xgb.train( plst, xgmat, num_round, watchlist ); # save out model