This commit is contained in:
tqchen
2014-09-06 11:13:19 -07:00
parent 7879db8702
commit e9ed4eb1a2
17 changed files with 135 additions and 283 deletions

View File

@@ -1,3 +0,0 @@
XGBoost R Feature Walkthrough
====
To be finished

View File

@@ -1,47 +0,0 @@
require(xgboost)
require(methods)
data(agaricus.train)
data(agaricus.test)
# we use the agaricus data as the example dataset
# this walkthrough shows how to use xgboost for binary classification
trainX <- agaricus.train$data
trainY <- agaricus.train$label
testX <- agaricus.test$data
testY <- agaricus.test$label
#-------------------------------------
# this is the basic usage of xgboost
# you can pass a sparse matrix in the data field; this is helpful when your data is sparse,
# for example when you use one-hot encoding for feature vectors
bst <- xgboost(data = trainX, label = trainY, max_depth = 1, eta = 1, nround = 2,
objective = "binary:logistic")
# alternatively, you can pass a dense matrix
denseX <- as(trainX, "matrix")
bst <- xgboost(data = denseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
objective = "binary:logistic")
# you can also specify data as a file path to a LibSVM format input
# since we do not have a LibSVM format file for iris, the next line is only for illustration
# bst <- xgboost(data = 'iris.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
param <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
preds <- predict(bst, dtest)
labels <- getinfo(dtest,'label')
cat('error=', mean(as.numeric(preds>0.5)!=labels),'\n')
xgb.save(bst, 'xgb.model')
xgb.dump(bst, 'dump.raw.txt')
xgb.dump(bst, 'dump.nice.txt', '../data/featmap.txt')
bst2 <- xgb.load('xgb.model')
preds2 <- predict(bst2,dtest)
stopifnot(sum((preds-preds2)^2)==0)
### Test xgb.DMatrix with a local file, a sparse matrix, and a dense matrix in R
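# a minimal sketch of the local-file case: an xgb.DMatrix can be saved to a binary
# buffer file and reconstructed from it ('dtrain.buffer' is just an illustrative name)
xgb.DMatrix.save(dtrain, 'dtrain.buffer')
dtrain2 <- xgb.DMatrix('dtrain.buffer')
# the reloaded matrix should behave exactly like the in-memory one
stopifnot(sum((predict(bst, dtrain) - predict(bst, dtrain2))^2) == 0)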

View File

@@ -1,31 +0,0 @@
require(xgboost)
data(agaricus.train)
data(agaricus.test)
trainX <- agaricus.train$data
trainY <- agaricus.train$label
testX <- agaricus.test$data
testY <- agaricus.test$label
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
watchlist <- list(eval = dtest, train = dtrain)
cat('start running example to start from an initial prediction\n')
param <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
bst <- xgb.train(param, dtrain, 1, watchlist)
ptrain <- predict(bst, dtrain, outputmargin=TRUE)
ptest <- predict(bst, dtest, outputmargin=TRUE)
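# a quick sanity check (sketch): for binary:logistic the margin is the score before the
# logistic transformation, so applying the sigmoid should recover the probability output
stopifnot(max(abs(1/(1 + exp(-ptest)) - predict(bst, dtest))) < 1e-6)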
# set the margins as base margin, so further boosting starts from these predictions
setinfo(dtrain, 'base_margin', ptrain)
setinfo(dtest, 'base_margin', ptest)
cat('this is the result of running from the initial prediction\n')
bst <- xgb.train(param, dtrain, 1, watchlist)

View File

@@ -1,75 +0,0 @@
require(xgboost)
data(agaricus.train)
data(agaricus.test)
trainX <- agaricus.train$data
trainY <- agaricus.train$label
testX <- agaricus.test$data
testY <- agaricus.test$label
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
num_round <- 2
param <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
cat('running cross validation\n')
# do cross validation; this prints the result as
# [iteration] metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric across folds
set.seed(0)
xgb.cv(param, dtrain, num_round, nfold = 5, metrics = list('error'))
cat('running cross validation, disable standard deviation display\n')
# same as above, but showsd = FALSE suppresses the standard deviation display
set.seed(0)
xgb.cv(param, dtrain, num_round, nfold = 5, metrics = list('error'), showsd = FALSE)
cat('running cross validation, with preprocessing function\n')
# define the preprocessing function
# it returns the preprocessed training data, test data, and parameters
# we can use this to do weight rescaling, etc.
# as an example, we try to set scale_pos_weight
fpreproc <- function(dtrain, dtest, param) {
  label <- getinfo(dtrain, 'label')
  ratio <- mean(label == 0)
  param <- append(param, list(scale_pos_weight = ratio))
  return(list(dtrain = dtrain, dtest = dtest, param = param))
}
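# a quick illustration (sketch): call fpreproc by hand to inspect the parameter it adds
res <- fpreproc(dtrain, dtest, param)
cat('fpreproc sets scale_pos_weight to', res$param$scale_pos_weight, '\n')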
# do cross validation; for each fold,
# dtrain, dtest, and param are passed into fpreproc,
# and its return value is used to generate the results of that fold
set.seed(0)
xgb.cv(param, dtrain, num_round, nfold = 5, metrics = list('auc'), fpreproc = fpreproc)
###
# you can also do cross validation with a customized loss function
# see custom_objective.R
##
cat('running cross validation, with customized loss function\n')
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(metric = "error", value = err))
}
param <- list(max_depth = 2, eta = 1, silent = 1)
# train with customized objective
set.seed(0)
xgb.cv(param, dtrain, num_round, nfold = 5, obj = logregobj, feval = evalerror)

View File

@@ -1,47 +0,0 @@
require(xgboost)
data(agaricus.train)
data(agaricus.test)
trainX <- agaricus.train$data
trainY <- agaricus.train$label
testX <- agaricus.test$data
testY <- agaricus.test$label
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
# note: for a customized objective function, we leave the objective parameter unset
# note: what we get during training is the margin value of the prediction
# you must know what you are doing
param <- list(max_depth = 2, eta = 1, silent = 1)
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
# user-defined objective function: given the prediction, return the gradient and second-order gradient
# this is the log-likelihood loss for binary:logistic
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
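# a small finite-difference check (sketch, base R only): the logistic loss is
# l(m) = -y*log(p) - (1-y)*log(1-p) with p = 1/(1+exp(-m)); its gradient in m is p - y
m <- 0.3; y <- 1; eps <- 1e-6
loss <- function(m, y) { p <- 1/(1 + exp(-m)); -y*log(p) - (1-y)*log(1-p) }
numgrad <- (loss(m + eps, y) - loss(m - eps, y)) / (2*eps)
stopifnot(abs(numgrad - (1/(1 + exp(-m)) - y)) < 1e-4)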
# user-defined evaluation function, returns a pair (metric_name, result)
# NOTE: with a customized objective, the prediction passed in is the margin value,
# which can make built-in evaluation metrics misbehave
# for example, with logistic loss the prediction is the score before the logistic transformation,
# while the built-in evaluation error assumes the input is after the logistic transformation
# keep this in mind when you customize, and write a customized evaluation function if needed
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(metric = "error", value = err))
}
# training with a customized objective; we can also train step by step,
# as the sketch after this call shows
bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
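# a minimal sketch of step-by-step training, using only calls shown in these demos:
# train one round, feed the resulting margins back as base_margin, then train one more round
bst1 <- xgb.train(param, dtrain, 1, watchlist, logregobj, evalerror)
setinfo(dtrain, 'base_margin', predict(bst1, dtrain, outputmargin = TRUE))
setinfo(dtest, 'base_margin', predict(bst1, dtest, outputmargin = TRUE))
bst2 <- xgb.train(param, dtrain, 1, watchlist, logregobj, evalerror)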

View File

@@ -1,32 +0,0 @@
#!/usr/bin/python
import sys
sys.path.append('../../wrapper')
import xgboost as xgb
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we use a linear model instead of trees as the booster
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
'alpha': 0.0001, 'lambda': 1 }
# normally, you do not need to set eta (step size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# so parallelization can affect convergence in certain cases
# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable
# param['eta'] = 0.5
##
# the rest of settings are the same
##
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()
print('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))))

View File

@@ -1,27 +0,0 @@
require(xgboost)
data(agaricus.train)
data(agaricus.test)
trainX <- agaricus.train$data
trainY <- agaricus.train$label
testX <- agaricus.test$data
testY <- agaricus.test$label
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
param <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
cat('start testing prediction from first n trees\n')
labels <- getinfo(dtest,'label')
ypred1 <- predict(bst, dtest, ntreelimit = 1)  # prediction using only the first tree
ypred2 <- predict(bst, dtest)                  # prediction using all trees
cat('error of ypred1=', mean(as.numeric(ypred1>0.5)!=labels),'\n')
cat('error of ypred2=', mean(as.numeric(ypred2>0.5)!=labels),'\n')
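# a small consistency check (sketch): with 2 rounds trained, limiting prediction to
# 2 trees should reproduce the default prediction that uses all trees
stopifnot(max(abs(predict(bst, dtest, ntreelimit = 2) - ypred2)) < 1e-6)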

View File

@@ -1,5 +0,0 @@
#!/bin/bash
# run all R demos in this folder
Rscript basic_walkthrough.R
Rscript custom_objective.R
Rscript boost_from_prediction.R
# remaining demos; filenames assumed from this commit's contents
Rscript cross_validation.R
Rscript predict_first_ntree.R