From 39fa45debe3fa768d79b077833b5045e139d3a42 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Fri, 4 Dec 2015 15:16:58 +0100
Subject: [PATCH] Add code to demo of leaf (show imprmt in accuracy)

---
Review note: create.new.tree.features() was amended during review (undefined
`trees` object, partially matched `level =` argument, defunct cBind() call),
so the post-image blob id on the index line predates these edits.

 R-package/demo/predict_leaf_indices.R | 53 +++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 7 deletions(-)

diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R
index 110bf9602..6cde561c2 100644
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@@ -1,4 +1,9 @@
 require(xgboost)
+require(data.table)
+require(Matrix)
+
+set.seed(1982)
+
 # load in the agaricus dataset
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
@@ -6,16 +11,50 @@ dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
 
 param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
-watchlist <- list(eval = dtest, train = dtrain)
-nround = 5
+nround = 4
 
 # training the model for two rounds
-bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2, watchlist = watchlist)
-cat('start testing prediction from first n trees\n')
+bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
+
+# Model accuracy without new features
+accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
 
-### predict using first 2 tree
-pred_with_leaf = predict(bst, dtest, ntreelimit = 2, predleaf = TRUE)
-head(pred_with_leaf)
 # by default, we predict using all the trees
+
 pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
 head(pred_with_leaf)
+
+# Append one-hot encoded leaf indices of every tree to a feature matrix.
+#   model             - trained model whose predict() method supports predleaf
+#   original.features - feature matrix the model is evaluated on
+# Returns original.features with the leaf indicator columns bound on the right.
+create.new.tree.features <- function(model, original.features){
+  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
+  # pred_with_leaf is indexed by column below, one column per tree; take the
+  # tree count from it instead of the `trees` object, which is never defined
+  n.trees <- ncol(pred_with_leaf)
+  cols <- vector("list", n.trees)
+  for(i in seq_len(n.trees)){
+    # max is not the real max but it's not important for the purpose of adding features
+    max.leaf <- max(pred_with_leaf[,i])
+    cols[[i]] <- factor(x = pred_with_leaf[,i], levels = seq_len(max.leaf))
+  }
+  # plain cbind() handles sparse Matrix objects; cBind() is defunct in Matrix
+  cbind(original.features, sparse.model.matrix( ~ ., as.data.frame(cols)))
+}
+
+# Convert previous features to one hot encoding
+new.features.train <- create.new.tree.features(bst, agaricus.train$data)
+new.features.test <- create.new.tree.features(bst, agaricus.test$data)
+
+# learning with new features
+new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
+new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
+watchlist <- list(train = new.dtrain)
+bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
+
+# Model accuracy with new features
+accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+
+# Here the accuracy was already good and is now perfect.
+print(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!"))