Merge pull request #124 from pommedeterresautee/master

Add a new function to see importance of features in a model
This commit is contained in:
Tianqi Chen 2014-12-28 20:06:55 +08:00
commit 0c7e090c19
21 changed files with 141 additions and 30 deletions

1
.gitignore vendored
View File

@ -44,3 +44,4 @@ Debug
*dump
*save
*csv
.Rproj.user

View File

@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: eXtreme Gradient Boosting
Version: 0.3-2
Date: 2014-08-23
Version: 0.3-3
Date: 2014-12-28
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
Maintainer: Tong He <hetong007@gmail.com>
Description: This package is a R wrapper of xgboost, which is short for eXtreme
@ -21,4 +21,6 @@ Depends:
R (>= 2.10)
Imports:
Matrix (>= 1.1-0),
methods
methods,
data.table (>= 1.9),
magrittr (>= 1.5)

View File

@ -1,4 +1,4 @@
# Generated by roxygen2 (4.0.1): do not edit by hand
# Generated by roxygen2 (4.1.0): do not edit by hand
export(getinfo)
export(setinfo)
@ -7,6 +7,7 @@ export(xgb.DMatrix)
export(xgb.DMatrix.save)
export(xgb.cv)
export(xgb.dump)
export(xgb.importance)
export(xgb.load)
export(xgb.save)
export(xgb.train)
@ -15,3 +16,6 @@ exportMethods(predict)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgeMatrix)
importFrom(data.table,":=")
importFrom(data.table,data.table)
importFrom(magrittr,"%>%")

View File

@ -0,0 +1,56 @@
#' Show importance of features in a model
#'
#' Read an xgboost model in text file format.
#' Can be a tree or linear model (text dumps of linear models are only supported in the dev version of xgboost for now).
#'
#' Return a data.table of the features with their weight.
#'
#' @importFrom data.table data.table
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix.
#' @param filename_dump the name of the text file.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#'
#' #Both datasets are lists with two items, a sparse matrix and labels (the outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#' test <- agaricus.test
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' xgb.dump(bst, 'xgb.model.dump')
#'
#' #agaricus.test$data@@Dimnames[[2]] holds the column names of the sparse matrix.
#' xgb.importance(agaricus.test$data@@Dimnames[[2]], 'xgb.model.dump')
#'
#' @export
xgb.importance <- function(feature_names, filename_dump) {
  text <- readLines(filename_dump)
  # A dump whose second line is "bias:" is handed to linearDump(); every other
  # dump is handed to treeDump(). isTRUE() keeps a dump with fewer than two
  # lines (text[2] is NA) from aborting the `if` with a missing-value error;
  # such a dump falls through to the tree branch.
  if (isTRUE(text[2] == "bias:")) {
    linearDump(feature_names, text)
  } else {
    treeDump(feature_names, text)
  }
}
# Compute feature weights from the text dump of a tree model.
#
# Every dump line containing a split token of the form "[f<index><...]" counts
# as one use of that feature; a feature's weight is its share of all splits.
#
# feature_names: vector of feature names, indexed by the split's feature index
#   plus one (assumes the dump numbers features from zero -- TODO confirm).
# text: character vector holding the lines of the dump.
#
# Returns a data.table with columns Feature and Weight, largest weight first.
treeDump <- function(feature_names, text) {
  # regexpr()/regmatches() are vectorised and keep only the matching lines, so
  # the result is built in one pass instead of being grown with c() in a loop.
  nodes <- regmatches(text, regexpr("\\[f.*\\]", text))
  conditions <- sub("\\]", "", sub("\\[f", "", nodes))
  # The feature index is whatever precedes the first "<" of the condition.
  index_text <- vapply(
    strsplit(conditions, "<"),
    function(parts) parts[1],
    character(1)
  )
  features <- feature_names[as.numeric(index_text) + 1]

  # 1. count splits per feature, 2. turn counts into shares,
  # 3. biggest share on top, 4. drop the temporary count column by name.
  counts <- data.table(Feature = features)[, .N, by = Feature]
  counts[, Weight := N / sum(N)]
  counts[order(-rank(Weight))][, c("Feature", "Weight"), with = FALSE]
}
# Build the feature/weight table from the text dump of a linear model.
#
# feature_names: vector of feature names, paired positionally with the weights.
# text: character vector holding the lines of the dump; every line after the
#   "weight:" header is parsed as one numeric weight.
#
# Returns a data.table with columns Feature and Weight.
# Stops with an error when the dump has no "weight:" line.
linearDump <- function(feature_names, text) {
  header <- which(text == "weight:")
  if (length(header) == 0L) {
    stop("no 'weight:' section found in the model dump", call. = FALSE)
  }
  # Drop everything up to and including the header. Unlike
  # (header + 1):length(text), this cannot count backwards when the header is
  # the last line of the dump.
  weights <- as.numeric(text[-seq_len(header[1L])])
  data.table(Feature = feature_names, Weight = weights)
}

Binary file not shown.

Binary file not shown.

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
\docType{data}
\name{agaricus.test}
\alias{agaricus.test}

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
\docType{data}
\name{agaricus.train}
\alias{agaricus.train}

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/getinfo.xgb.DMatrix.R
\docType{methods}
\name{getinfo}
\alias{getinfo}
@ -12,9 +13,9 @@ getinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{name}{the name of the field to get}
\item{...}{other parameters}
\item{name}{the name of the field to get}
}
\description{
Get information of an xgb.DMatrix object

View File

@ -1,11 +1,12 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/predict.xgb.Booster.R
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, outputmargin = FALSE,
ntreelimit = NULL)
\S4method{predict}{xgb.Booster}(object, newdata, missing = NULL,
outputmargin = FALSE, ntreelimit = NULL)
}
\arguments{
\item{object}{Object of class "xgb.Boost"}

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/setinfo.xgb.DMatrix.R
\docType{methods}
\name{setinfo}
\alias{setinfo}
@ -12,11 +13,11 @@ setinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{name}{the name of the field to get}
\item{info}{the specific field of information to set}
\item{...}{other parameters}
}
\description{
Set information of an xgb.DMatrix object

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/slice.xgb.DMatrix.R
\docType{methods}
\name{slice}
\alias{slice}
@ -13,9 +14,9 @@ slice(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{idxset}{a integer vector of indices of rows needed}
\item{...}{other parameters}
\item{idxset}{a integer vector of indices of rows needed}
}
\description{
Get a new DMatrix containing the specified rows of

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Contruct xgb.DMatrix object}

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.save.R
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}

View File

@ -1,10 +1,12 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.cv.R
\name{xgb.cv}
\alias{xgb.cv}
\title{Cross Validation}
\usage{
xgb.cv(params = list(), data, nrounds, nfold, label = NULL, showsd = TRUE,
metrics = list(), obj = NULL, feval = NULL, ...)
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
missing = NULL, showsd = TRUE, metrics = list(), obj = NULL,
feval = NULL, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.dump.R
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}

View File

@ -0,0 +1,33 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.importance.R
\name{xgb.importance}
\alias{xgb.importance}
\title{Show importance of features in a model}
\usage{
xgb.importance(feature_names, filename_dump)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix.}
\item{filename_dump}{the name of the text file.}
}
\description{
Read a xgboost model in text file format. Return a data.table of the features with their weight.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
#Both dataset are list with two items, a sparse matrix and labels (outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump')
#agaricus.test$data@Dimnames[[2]] represents the column name of the sparse matrix.
xgb.importance(agaricus.test$data@Dimnames[[2]], 'xgb.model.dump')
}

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.load.R
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.save.R
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}

View File

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.train.R
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}

View File

@ -1,10 +1,11 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
\usage{
xgboost(data = NULL, label = NULL, params = list(), nrounds,
verbose = 1, ...)
xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
nrounds, verbose = 1, ...)
}
\arguments{
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or