remove inst/, improve vignette

This commit is contained in:
hetong 2014-09-06 23:05:21 -07:00
parent 50d77c72eb
commit cd35d88a03
6 changed files with 27 additions and 8437 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,10 +0,0 @@
require(xgboost)
require(methods)
# Directly read in local file
dtrain <- xgb.DMatrix("agaricus.txt.train")
history <- xgb.cv(data = dtrain, nround = 3, nfold = 5,
                  metrics = list("rmse", "auc"),
                  max_depth = 3, eta = 1, objective = "binary:logistic")


@@ -1,153 +0,0 @@
require(xgboost)
require(methods)
require(Matrix)  # for the sparseMatrix class used in read.libsvm below
# Helper function to read the libsvm format. It is inefficient: the data is
# loaded as a dense matrix first and then converted to sparse, so use it for
# demo purposes only. Adapted from
# https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
  content <- readLines(fname)
  nline <- length(content)
  label <- numeric(nline)
  mat <- matrix(0, nline, maxcol + 1)
  for (i in 1:nline) {
    arr <- strsplit(content[i], " ")[[1]]
    label[i] <- as.numeric(arr[1])
    # seq_along(arr)[-1] is empty for label-only lines, unlike 2:length(arr)
    for (j in seq_along(arr)[-1]) {
      kv <- strsplit(arr[j], ":")[[1]]
      # shift by one to avoid a 0 feature index in R
      findex <- as.integer(kv[1]) + 1
      fvalue <- as.numeric(kv[2])
      mat[i, findex] <- fvalue
    }
  }
  mat <- as(mat, "sparseMatrix")
  return(list(label = label, data = mat))
}
############################ Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
# Directly read in local file
dtrain <- xgb.DMatrix("agaricus.txt.train")
class(dtrain)
# read file in R
csc <- read.libsvm("agaricus.txt.train", 126)
y <- csc$label
x <- csc$data
# x as Sparse Matrix
class(x)
dtrain <- xgb.DMatrix(x, label = y)
# x as dense matrix
dense.x <- as.matrix(x)
dtrain <- xgb.DMatrix(dense.x, label = y)
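# Quick sanity check (illustrative): each of the three constructions above
# should carry the same number of labels.
length(getinfo(dtrain, "label"))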
############################ Test xgboost with local file, sparse matrix and dense matrix in R.
# Test with DMatrix object
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# Verbose = 0, 1, 2
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 0)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 1)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 2)
# Test with local file
bst <- xgboost(data = "agaricus.txt.train", max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# Test with sparse matrix
bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# Test with dense matrix
bst <- xgboost(data = dense.x, label = y, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
############################ Test predict
# Prediction with DMatrix object
dtest <- xgb.DMatrix("agaricus.txt.test")
pred <- predict(bst, dtest)
# Prediction with local test file
pred <- predict(bst, "agaricus.txt.test")
# Prediction with Sparse Matrix
csc <- read.libsvm("agaricus.txt.test", 126)
test.y <- csc$label
test.x <- csc$data
pred <- predict(bst, test.x)
# Extract labels with getinfo
labels <- getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(pred > 0.5) != labels))/length(labels)
print(paste("error=", err))
############################ Save and load model to hard disk
# save model to binary local file
xgb.save(bst, "xgboost.model")
# load binary model to R
bst <- xgb.load("xgboost.model")
pred <- predict(bst, test.x)
# save model to text file
xgb.dump(bst, "dump.raw.txt")
# save model to text file, with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")
# save a DMatrix object to hard disk
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# load a DMatrix object to R
dtrain <- xgb.DMatrix("dtrain.buffer")
############################ More flexible training function xgb.train
param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
watchlist <- list(eval = dtest, train = dtrain)
# training xgboost model
bst <- xgb.train(param, dtrain, nround = 2, watchlist = watchlist)
############################ Customized loss function
param <- list(max_depth = 2, eta = 1, silent = 1)
# note: for a customized objective function, leave `objective` at its default.
# note: what we get in `preds` is the raw margin value, not a probability,
# so you must know what you are doing.
# User-defined objective function: given predictions and the training DMatrix,
# return the gradient and second-order gradient. This is log-likelihood
# (logistic) loss: for p = 1/(1 + exp(-margin)), grad = p - y and
# hess = p * (1 - p).
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# User-defined evaluation function: return list(metric = 'metric-name',
# value = 'metric-value').
# NOTE: with a customized objective, the prediction passed in is the margin
# value, which can make built-in evaluation metrics misbehave. Here, for
# logistic loss, the prediction is the score before the logistic
# transformation, while the built-in evaluation error assumes input after
# that transformation. Keep this in mind and write a customized evaluation
# function when needed, as below.
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
# Training with a customized objective. We can also train step by step;
# see the implementation of train in xgboost.py for reference.
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
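# Since the model above was trained with a customized objective, predict()
# returns margin values. A small sketch (reusing `bst` and `dtest` from
# earlier in this demo): apply the logistic transformation before
# thresholding at 0.5.
margin <- predict(bst, dtest)
prob <- 1 / (1 + exp(-margin))
err <- mean(as.integer(prob > 0.5) != getinfo(dtest, "label"))
print(paste("error after logistic transform =", err))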


@@ -1,126 +0,0 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i


@@ -80,12 +80,15 @@ Mushroom data is cited from UCI Machine Learning Repository. \citep{Bache+Lichma
<<Training and prediction with iris>>=
library(xgboost)
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'),
nrounds = 5)
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1,
nround = 2, objective = "binary:logistic")
xgb.save(bst, 'model.save')
bst = xgb.load('model.save')
pred <- predict(bst, as.matrix(iris[,1:4]))
pred <- predict(bst, test$data)
@
\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
@@ -102,17 +105,19 @@ The output looks like
\begin{verbatim}
booster[0]:
0:[f2<2.45] yes=1,no=2,missing=1
  1:leaf=0.147059
  2:[f3<1.65] yes=3,no=4,missing=3
    3:leaf=0.464151
    4:leaf=0.722449
0:[f28<1.00001] yes=1,no=2,missing=2
  1:[f108<1.00001] yes=3,no=4,missing=4
    3:leaf=1.85965
    4:leaf=-1.94071
  2:[f55<1.00001] yes=5,no=6,missing=6
    5:leaf=-1.70044
    6:leaf=1.71218
booster[1]:
0:[f2<2.45] yes=1,no=2,missing=1
  1:leaf=0.103806
  2:[f2<4.85] yes=3,no=4,missing=3
    3:leaf=0.316341
    4:leaf=0.510365
0:[f59<1.00001] yes=1,no=2,missing=2
  1:leaf=-6.23624
  2:[f28<1.00001] yes=3,no=4,missing=4
    3:leaf=-0.96853
    4:leaf=0.784718
\end{verbatim}
It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
@@ -121,18 +126,16 @@ training from initial prediction value, weighted training instance.
We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
<<xgb.DMatrix>>=
iris.mat <- as.matrix(iris[,1:4])
iris.label <- as.numeric(iris[,5]=='setosa')
diris <- xgb.DMatrix(iris.mat, label = iris.label)
class(diris)
getinfo(diris,'label')
dtrain <- xgb.DMatrix(train$data, label = train$label)
class(dtrain)
head(getinfo(dtrain,'label'))
@
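For instance, the auxiliary fields mentioned above can be attached to the same
object with \verb@setinfo@, the counterpart of \verb@getinfo@ (an illustrative
sketch using uniform weights):
<<setinfo>>=
w <- rep(1, length(train$label))
setinfo(dtrain, 'weight', w)
@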
We can also save the matrix to a binary file. Then load it simply with
\verb@xgb.DMatrix@
<<save model>>=
xgb.DMatrix.save(diris, 'iris.xgb.DMatrix')
diris = xgb.DMatrix('iris.xgb.DMatrix')
xgb.DMatrix.save(dtrain, 'xgb.DMatrix')
dtrain = xgb.DMatrix('xgb.DMatrix')
@
\section{Advanced Examples}
@@ -157,11 +160,11 @@ evalerror <- function(preds, dtrain) {
return(list(metric = "MSE", value = err))
}
dtest <- slice(diris,1:100)
watchlist <- list(eval = dtest, train = diris)
dtest <- xgb.DMatrix(test$data, label = test$label)
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max_depth = 2, eta = 1, silent = 1)
bst <- xgb.train(param, diris, nround = 2, watchlist, logregobj, evalerror)
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
@
The gradient and second-order gradient are required for the output of a customized