commit 9267e3b368
@@ -36,6 +36,8 @@
 #' @param feval customized evaluation function. Returns
 #' \code{list(metric='metric-name', value='metric-value')} with given
 #' prediction and dtrain,
+#' @param missing Missing is only used when the input is a dense matrix; pick a float
+#' value that represents the missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
 #' @param ... other parameters to pass to \code{params}.
 #'
 #' @details
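
For illustration, a minimal sketch of a customized evaluation function that follows the feval contract documented above; the function name and error formula are hypothetical, while getinfo is the package's existing accessor for dtrain fields:

    # Hypothetical feval: binary classification error, computed from the
    # predictions and the labels stored in dtrain.
    evalerror <- function(preds, dtrain) {
      labels <- getinfo(dtrain, "label")
      err <- sum(as.integer(preds > 0.5) != labels) / length(labels)
      list(metric = "error", value = err)
    }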
@@ -73,7 +75,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
   }

   folds <- xgb.cv.mknfold(dtrain, nfold, params)
-  history <- list()
+  history <- c()
   for (i in 1:nrounds) {
     msg <- list()
     for (k in 1:nfold) {
@@ -83,8 +85,12 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
                      "\t")[[1]]
     }
     ret <- xgb.cv.aggcv(msg, showsd)
-    history <- append(history, ret)
+    history <- c(history, ret)
     cat(paste(ret, "\n", sep=""))
   }
-  return (TRUE)
+  return (history)
 }
+
+xgb.cv.strip.numeric <- function(x) {
+  as.numeric(strsplit(regmatches(x, regexec("test-(.*):(.*)$", x))[[1]][3], "\\+")[[1]])
+}
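
As a hedged usage sketch (assuming dtrain and params are set up as earlier in this file, and with arbitrary nrounds/nfold values), the history that xgb.cv now returns can be post-processed with xgb.cv.strip.numeric to recover the per-round test metric:

    history <- xgb.cv(params, dtrain, nrounds = 10, nfold = 5)
    # Each entry is one evaluation line; element 1 of the parsed result is the
    # test-metric mean (element 2, when showsd is on, the standard deviation).
    test_means <- sapply(history, function(line) xgb.cv.strip.numeric(line)[1])
    plot(test_means, type = "l", xlab = "round", ylab = "test metric")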
@@ -24,6 +24,8 @@
 #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
 #' information of performance. If 2, xgboost will print information of both
 #' performance and construction progress information
+#' @param missing Missing is only used when the input is a dense matrix; pick a float
+#' value that represents the missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
 #' @param ... other parameters to pass to \code{params}.
 #'
 #' @details

@@ -74,7 +76,7 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(),
 #'
 #' \itemize{
 #'  \item \code{label} the label for each record
-#'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
+#'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
 #' }
 #'
 #' @references

@@ -101,7 +103,7 @@ NULL
 #'
 #' \itemize{
 #'  \item \code{label} the label for each record
-#'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
+#'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
 #' }
 #'
 #' @references

@@ -116,5 +118,5 @@ NULL
 #' @name agaricus.test
 #' @usage data(agaricus.test)
 #' @format A list containing a label vector, and a dgCMatrix object with 1611
-#' rows and 127 variables
+#' rows and 126 variables
 NULL

@@ -4,3 +4,4 @@ boost_from_prediction Boosting from existing prediction
 predict_first_ntree Predicting using first n trees
 generalized_linear_model Generalized Linear Model
 cross_validation Cross validation
+create_sparse_matrix Create a sparse matrix from a dense one

@@ -6,6 +6,7 @@ XGBoost R Feature Walkthrough
 * [Predicting using first n trees](predict_first_ntree.R)
 * [Generalized Linear Model](generalized_linear_model.R)
 * [Cross validation](cross_validation.R)
+* [Create a sparse matrix from a dense one](create_sparse_matrix.R)

 Benchmarks
 ====
@@ -88,6 +88,9 @@ pred <- predict(bst, dtest)
 err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
 print(paste("test-error=", err))

-# Finally, you can dump the tree you learned using xgb.dump into a text file
-xgb.dump(bst, "dump.raw.txt")
+# You can dump the tree you learned using xgb.dump into a text file
+xgb.dump(bst, "dump.raw.txt", with.stats = T)

+# Finally, you can check which features are the most important.
+print("Most important features (look at column Gain):")
+print(xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt"))
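
As a hedged aside, the effect of with.stats = T can be checked by reading the dump back: it is plain text, one line per tree node, with the gain and cover statistics appended.

    # Peek at the first lines of the dump written above
    cat(readLines("dump.raw.txt", n = 10), sep = "\n")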
R-package/demo/create_sparse_matrix.R (new file, 65 lines)
@@ -0,0 +1,65 @@
+require(xgboost)
+require(Matrix)
+require(data.table)
+require(vcd) # Available on CRAN. Used for its dataset with categorical values.
+
+# According to its documentation, XGBoost works only on numbers.
+# Sometimes the dataset we have to work on contains categorical data.
+# A categorical variable is one which has a fixed number of values. For example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as its value, it is a categorical variable.
+#
+# In R, a categorical variable is called a factor.
+# Type ?factor in the console for more information.
+#
+# In this demo we will see how to transform a dense data frame with categorical variables into a sparse matrix before analyzing it in XGBoost.
+# The method we are going to see is usually called "one-hot encoding".
+
+# Load the Arthritis dataset in memory.
+data(Arthritis)
+
+# Create a copy of the dataset with the data.table package (data.table is 100% compliant with R data frames, but its syntax is a lot more consistent and its performance is really good).
+df <- data.table(Arthritis, keep.rownames = FALSE)
+
+# Let's have a look at the data.table
+cat("Print the dataset\n")
+print(df)
+
+# Two columns have factor type and one has ordinal type (an ordinal variable is a categorical variable whose values can be ordered, here: None > Some > Marked).
+cat("Structure of the dataset\n")
+str(df)
+
+# We remove the Age column, which is of no interest for the purpose of this demo.
+df[, Age := NULL]
+
+# List the different values of the column Treatment: Placebo, Treated.
+cat("Values of the categorical feature Treatment\n")
+print(levels(df[, Treatment]))
+
+# Next step: we will transform the categorical data into dummy variables.
+# This method is also called dummy encoding.
+# The purpose is to transform each value of each categorical feature into one binary feature.
+#
+# For example, the column Treatment will be replaced by two columns, Placebo and Treated. Each of them will be binary, meaning that an observation which had the value Placebo in the column Treatment before the transformation will have the value 1 in the new column Placebo and the value 0 in the new column Treated.
+#
+# The formula Improved ~ . - 1 means: transform all categorical features except the column Improved to binary values.
+# The column Improved is excluded because it will be our output column, the one we want to predict.
+sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
+
+cat("Encoding of the sparse Matrix\n")
+print(sparse_matrix)
+
+# Create the output vector (not sparse):
+# 1. set, for all rows, the field in the Y column to 0;
+# 2. set Y to 1 when Improved == "Marked";
+# 3. return the Y column.
+output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y]
+
+# What follows is the same process as in the other demos.
+cat("Learning...\n")
+bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 3,
+               eta = 1, nround = 2, objective = "binary:logistic")
+xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
+
+# sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
+importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
+print(importance)
+# According to the importance table, the most important feature in this dataset for predicting whether the treatment will work is having received a placebo or not.
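
To make the one-hot encoding step above concrete, here is a small standalone sketch (toy data invented for illustration) of what sparse.model.matrix does to a single factor column when the intercept is dropped:

    library(Matrix)
    toy <- data.frame(Colour = factor(c("red", "blue", "green", "red")))
    # With -1 (no intercept), every level gets its own 0/1 indicator column
    print(sparse.model.matrix(~ Colour - 1, data = toy))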
@@ -5,7 +5,7 @@
 \alias{agaricus.test}
 \title{Test part from Mushroom Data Set}
 \format{A list containing a label vector, and a dgCMatrix object with 1611
-rows and 127 variables}
+rows and 126 variables}
 \usage{
 data(agaricus.test)
 }

@@ -18,7 +18,7 @@ This data set includes the following fields:

 \itemize{
   \item \code{label} the label for each record
-  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
+  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
 }
 }
 \references{

@@ -18,7 +18,7 @@ This data set includes the following fields:

 \itemize{
   \item \code{label} the label for each record
-  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
+  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
 }
 }
 \references{
@@ -32,6 +32,8 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL,

 \item{label}{option field, when data is Matrix}

+\item{missing}{Missing is only used when input is dense matrix, pick a float}
+
 \item{showsd}{boolean, whether show standard deviation of cross validation}

 \item{metrics,}{list of evaluation metrics to be used in cross validation,
@@ -13,6 +13,8 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),

 \item{label}{the response variable. User should not set this field,}

+\item{missing}{Missing is only used when input is dense matrix, pick a float}
+
 \item{params}{the list of parameters. Commonly used ones are:
 \itemize{
   \item \code{objective} objective function, common ones are
@@ -19,6 +19,7 @@ Learning about the model: [Introduction to Boosted Trees](http://homes.cs.washin

 What's New
 =====
 * XGBoost wins [Tradeshift Text Classification](https://kaggle2.blob.core.windows.net/forum-message-attachments/60041/1813/TradeshiftTextClassification.pdf?sv=2012-02-12&se=2015-01-02T13%3A55%3A16Z&sr=b&sp=r&sig=5MHvyjCLESLexYcvbSRFumGQXCS7MVmfdBIY3y01tMk%3D)
 * XGBoost wins [HEP meets ML Award in Higgs Boson Challenge](http://atlas.ch/news/2014/machine-learning-wins-the-higgs-challenge.html)
 * Thanks to Bing Xu, [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl) allows you to use xgboost from Julia
+* See the updated [demo folder](demo) for feature walkthrough