remove inst/, improve vignette
This commit is contained in:
parent 50d77c72eb
commit cd35d88a03
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,10 +0,0 @@
require(xgboost)
require(methods)

# Directly read in local file
dtrain <- xgb.DMatrix("agaricus.txt.train")

history <- xgb.cv(data = dtrain, nround = 3, nfold = 5, metrics = list("rmse", "auc"),
                  max_depth = 3, eta = 1,
                  objective = "binary:logistic")
@@ -1,153 +0,0 @@
require(xgboost)
require(methods)

# Helper function to read the libsvm format. It is badly written: the data is
# loaded as dense and then converted to sparse, so use it for demo purposes
# only. Adapted from
# https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
  content <- readLines(fname)
  nline <- length(content)
  label <- numeric(nline)
  mat <- matrix(0, nline, maxcol + 1)
  for (i in 1:nline) {
    arr <- as.vector(strsplit(content[i], " ")[[1]])
    label[i] <- as.numeric(arr[[1]])
    for (j in 2:length(arr)) {
      kv <- strsplit(arr[j], ":")[[1]]
      # shift by one to avoid the 0 index
      findex <- as.integer(kv[1]) + 1
      fvalue <- as.numeric(kv[2])
      mat[i, findex] <- fvalue
    }
  }
  mat <- as(mat, "sparseMatrix")
  return(list(label = label, data = mat))
}
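The helper above materializes a dense nline-by-(maxcol + 1) matrix before converting it to sparse. A sketch of a triplet-based variant that skips the dense intermediate; the helper name read.libsvm.sparse is hypothetical:

require(Matrix)

read.libsvm.sparse <- function(fname, maxcol) {
  content <- readLines(fname)
  tokens <- strsplit(content, " ")
  # the first token on each line is the label
  label <- as.numeric(sapply(tokens, function(tok) tok[1]))
  rows <- integer(0); cols <- integer(0); vals <- numeric(0)
  for (i in seq_along(tokens)) {
    kv <- strsplit(tokens[[i]][-1], ":")
    if (length(kv) == 0) next
    # collect (row, column, value) triplets; shift columns to avoid the 0 index
    rows <- c(rows, rep.int(i, length(kv)))
    cols <- c(cols, sapply(kv, function(p) as.integer(p[1]) + 1L))
    vals <- c(vals, sapply(kv, function(p) as.numeric(p[2])))
  }
  mat <- sparseMatrix(i = rows, j = cols, x = vals,
                      dims = c(length(content), maxcol + 1))
  return(list(label = label, data = mat))
}

It is called exactly like the original, e.g. csc <- read.libsvm.sparse("agaricus.txt.train", 126).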

############################ Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.

# Directly read in local file
dtrain <- xgb.DMatrix("agaricus.txt.train")
class(dtrain)

# Read the file in R
csc <- read.libsvm("agaricus.txt.train", 126)
y <- csc$label
x <- csc$data

# x as sparse matrix
class(x)
dtrain <- xgb.DMatrix(x, label = y)

# x as dense matrix
dense.x <- as.matrix(x)
dtrain <- xgb.DMatrix(dense.x, label = y)

############################ Test xgboost with local file, sparse matrix and dense matrix in R.

# Test with DMatrix object
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# verbose = 0, 1, 2
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 0)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 1)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 2)

# Test with local file
bst <- xgboost(data = "agaricus.txt.train", max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# Test with sparse matrix
bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# Test with dense matrix
bst <- xgboost(data = dense.x, label = y, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

############################ Test predict

# Prediction with DMatrix object
dtest <- xgb.DMatrix("agaricus.txt.test")
pred <- predict(bst, dtest)

# Prediction with local test file
pred <- predict(bst, "agaricus.txt.test")

# Prediction with sparse matrix
csc <- read.libsvm("agaricus.txt.test", 126)
test.y <- csc$label
test.x <- csc$data
pred <- predict(bst, test.x)

# Extract labels with getinfo
labels <- getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(pred > 0.5) != labels))/length(labels)
print(paste("error=", err))

############################ Save and load model to hard disk

# save model to binary local file
xgb.save(bst, "xgboost.model")

# load binary model back into R
bst <- xgb.load("xgboost.model")
pred <- predict(bst, test.x)

# save model to text file
xgb.dump(bst, "dump.raw.txt")
# save model to text file, with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")

# save a DMatrix object to hard disk
xgb.DMatrix.save(dtrain, "dtrain.buffer")

# load a DMatrix object back into R
dtrain <- xgb.DMatrix("dtrain.buffer")

############################ More flexible training function xgb.train

param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
watchlist <- list(eval = dtest, train = dtrain)

# train the xgboost model
bst <- xgb.train(param, dtrain, nround = 2, watchlist = watchlist)

############################ Customized loss function

param <- list(max_depth = 2, eta = 1, silent = 1)

# note: for a customized objective function, we leave objective as default
# note: what we get in prediction is the margin value; you must know what you are doing

# User-defined objective function: given the predictions, return the gradient
# and second-order gradient. This is the log-likelihood loss.
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}

# User-defined evaluation function: return list(metric = 'metric-name',
# value = 'metric-value'). NOTE: with a customized loss function, the default
# prediction value is the margin, which may keep the built-in evaluation
# metrics from working properly. For example, when doing logistic loss, the
# prediction is the score before the logistic transformation, while the
# built-in evaluation error assumes the input comes after it. Keep this in
# mind when you use the customization, and write a customized evaluation
# function if necessary.
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
  return(list(metric = "error", value = err))
}

# Training with a customized objective; we can also do step-by-step training.
# Simply look at xgboost.py's implementation of train.
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
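Because logregobj trains on the margin, predictions from this booster are margins rather than probabilities; a short sketch of recovering probabilities by hand, using the dtest object defined above:

# margin predictions from the custom-objective model
margin <- predict(bst, dtest)
# apply the logistic transformation manually to get probabilities
prob <- 1/(1 + exp(-margin))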
@@ -1,126 +0,0 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i
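Each row of this feature map ties a tree-dump feature index fN to a name and a type (i appears to mark indicator features). A sketch of putting the map to use, assuming the bst model and featmap.txt from the removed script above:

# dump the trees with indices resolved through the feature map:
# a split on f28 then reads as odor=none instead of a bare index
xgb.dump(bst, "dump.nice.txt", "featmap.txt")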
@@ -80,12 +80,15 @@ Mushroom data is cited from UCI Machine Learning Repository. \citep{Bache+Lichma

 <<Training and prediction with iris>>=
 library(xgboost)
-data(iris)
-bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'),
-              nrounds = 5)
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+train <- agaricus.train
+test <- agaricus.test
+bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1,
+               nround = 2, objective = "binary:logistic")
 xgb.save(bst, 'model.save')
 bst = xgb.load('model.save')
-pred <- predict(bst, as.matrix(iris[,1:4]))
+pred <- predict(bst, test$data)
 @

 \verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
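With the binary:logistic objective, the pred vector in the new chunk holds probabilities; a one-line sketch of checking the test error, assuming the test$label field from the chunk above:

# threshold the predicted probabilities and compare against the labels
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error =", err))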
@@ -102,17 +105,19 @@ The output looks like

 \begin{verbatim}
 booster[0]:
-0:[f2<2.45] yes=1,no=2,missing=1
-1:leaf=0.147059
-2:[f3<1.65] yes=3,no=4,missing=3
-3:leaf=0.464151
-4:leaf=0.722449
+0:[f28<1.00001] yes=1,no=2,missing=2
+1:[f108<1.00001] yes=3,no=4,missing=4
+3:leaf=1.85965
+4:leaf=-1.94071
+2:[f55<1.00001] yes=5,no=6,missing=6
+5:leaf=-1.70044
+6:leaf=1.71218
 booster[1]:
-0:[f2<2.45] yes=1,no=2,missing=1
-1:leaf=0.103806
-2:[f2<4.85] yes=3,no=4,missing=3
-3:leaf=0.316341
-4:leaf=0.510365
+0:[f59<1.00001] yes=1,no=2,missing=2
+1:leaf=-6.23624
+2:[f28<1.00001] yes=3,no=4,missing=4
+3:leaf=-0.96853
+4:leaf=0.784718
 \end{verbatim}

 It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
@@ -121,18 +126,16 @@ training from initial prediction value, weighted training instance.

 We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
 <<xgb.DMatrix>>=
-iris.mat <- as.matrix(iris[,1:4])
-iris.label <- as.numeric(iris[,5]=='setosa')
-diris <- xgb.DMatrix(iris.mat, label = iris.label)
-class(diris)
-getinfo(diris,'label')
+dtrain <- xgb.DMatrix(train$data, label = train$label)
+class(dtrain)
+head(getinfo(dtrain,'label'))
 @

 We can also save the matrix to a binary file. Then load it simply with
 \verb@xgb.DMatrix@
 <<save model>>=
-xgb.DMatrix.save(diris, 'iris.xgb.DMatrix')
-diris = xgb.DMatrix('iris.xgb.DMatrix')
+xgb.DMatrix.save(dtrain, 'xgb.DMatrix')
+dtrain = xgb.DMatrix('xgb.DMatrix')
 @

 \section{Advanced Examples}
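The hunk's context line mentions training from an initial prediction value and weighted training instances; a minimal sketch of those advanced DMatrix features, assuming the setinfo counterpart to the getinfo call used above:

# attach per-instance weights and an initial margin to the DMatrix
n <- length(getinfo(dtrain, 'label'))
setinfo(dtrain, 'weight', rep(1, n))
setinfo(dtrain, 'base_margin', rep(0, n))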
@@ -157,11 +160,11 @@ evalerror <- function(preds, dtrain) {
 return(list(metric = "MSE", value = err))
 }

-dtest <- slice(diris,1:100)
-watchlist <- list(eval = dtest, train = diris)
+dtest <- xgb.DMatrix(test$data, label = test$label)
+watchlist <- list(eval = dtest, train = dtrain)
 param <- list(max_depth = 2, eta = 1, silent = 1)

-bst <- xgb.train(param, diris, nround = 2, watchlist, logregobj, evalerror)
+bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
 @

 The gradient and second order gradient is required for the output of customized