Lint fix on infix operators

This commit is contained in:
terrytangyuan 2015-10-28 23:04:45 -04:00
parent d7fce99564
commit 8bae715994
12 changed files with 47 additions and 47 deletions

View File

@ -34,8 +34,8 @@ setMethod("slice", signature = "xgb.DMatrix",
attr_list <- attributes(object) attr_list <- attributes(object)
nr <- xgb.numrow(object) nr <- xgb.numrow(object)
len <- sapply(attr_list,length) len <- sapply(attr_list,length)
ind <- which(len==nr) ind <- which(len == nr)
if (length(ind)>0) { if (length(ind) > 0) {
nms <- names(attr_list)[ind] nms <- names(attr_list)[ind]
for (i in 1:length(ind)) { for (i in 1:length(ind)) {
attr(ret,nms[i]) <- attr(object,nms[i])[idxset] attr(ret,nms[i]) <- attr(object,nms[i])[idxset]

View File

@ -15,14 +15,14 @@ xgb.setinfo <- function(dmat, name, info) {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix") stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
} }
if (name == "label") { if (name == "label") {
if (length(info)!=xgb.numrow(dmat)) if (length(info) != xgb.numrow(dmat))
stop("The length of labels must equal to the number of rows in the input data") stop("The length of labels must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
PACKAGE = "xgboost") PACKAGE = "xgboost")
return(TRUE) return(TRUE)
} }
if (name == "weight") { if (name == "weight") {
if (length(info)!=xgb.numrow(dmat)) if (length(info) != xgb.numrow(dmat))
stop("The length of weights must equal to the number of rows in the input data") stop("The length of weights must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
PACKAGE = "xgboost") PACKAGE = "xgboost")
@ -36,7 +36,7 @@ xgb.setinfo <- function(dmat, name, info) {
return(TRUE) return(TRUE)
} }
if (name == "group") { if (name == "group") {
if (sum(info)!=xgb.numrow(dmat)) if (sum(info) != xgb.numrow(dmat))
stop("The sum of groups must equal to the number of rows in the input data") stop("The sum of groups must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info),
PACKAGE = "xgboost") PACKAGE = "xgboost")
@ -251,7 +251,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
# make simple non-stratified folds # make simple non-stratified folds
kstep <- length(randidx) %/% nfold kstep <- length(randidx) %/% nfold
folds <- list() folds <- list()
for (i in 1:(nfold-1)) { for (i in 1:(nfold - 1)) {
folds[[i]] <- randidx[1:kstep] folds[[i]] <- randidx[1:kstep]
randidx <- setdiff(randidx, folds[[i]]) randidx <- setdiff(randidx, folds[[i]])
} }
@ -310,7 +310,7 @@ xgb.createFolds <- function(y, k = 10)
## At most, we will use quantiles. If the sample ## At most, we will use quantiles. If the sample
## is too small, we just do regular unstratified ## is too small, we just do regular unstratified
## CV ## CV
cuts <- floor(length(y)/k) cuts <- floor(length(y) / k)
if(cuts < 2) cuts <- 2 if(cuts < 2) cuts <- 2
if(cuts > 5) cuts <- 5 if(cuts > 5) cuts <- 5
y <- cut(y, y <- cut(y,

View File

@ -130,7 +130,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
# if (!is.null(params$eval_metric) && !is.null(feval)) # if (!is.null(params$eval_metric) && !is.null(feval))
# stop("xgb.cv: cannot assign two different evaluation metrics") # stop("xgb.cv: cannot assign two different evaluation metrics")
if (!is.null(params$eval_metric)) if (!is.null(params$eval_metric))
if (class(params$eval_metric)=='function') { if (class(params$eval_metric) == 'function') {
feval <- params$eval_metric feval <- params$eval_metric
params[['eval_metric']] <- NULL params[['eval_metric']] <- NULL
} }
@ -158,7 +158,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
bestInd <- 0 bestInd <- 0
earlyStopflag <- FALSE earlyStopflag <- FALSE
if (length(metrics)>1) if (length(metrics) > 1)
warning('Only the first metric is used for early stopping process.') warning('Only the first metric is used for early stopping process.')
} }
@ -187,19 +187,19 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
ret <- xgb.cv.aggcv(msg, showsd) ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret) history <- c(history, ret)
if(verbose) if(verbose)
if (0 == (i-1L)%%print.every.n) if (0 == (i - 1L) %% print.every.n)
cat(ret, "\n", sep="") cat(ret, "\n", sep="")
# early_Stopping # early_Stopping
if (!is.null(early.stop.round)){ if (!is.null(early.stop.round)){
score <- strsplit(ret,'\\s+')[[1]][1+length(metrics)+2] score <- strsplit(ret,'\\s+')[[1]][1 + length(metrics) + 2]
score <- strsplit(score,'\\+|:')[[1]][[2]] score <- strsplit(score,'\\+|:')[[1]][[2]]
score <- as.numeric(score) score <- as.numeric(score)
if ((maximize && score > bestScore) || (!maximize && score < bestScore)) { if ((maximize && score > bestScore) || (!maximize && score < bestScore)) {
bestScore <- score bestScore <- score
bestInd <- i bestInd <- i
} else { } else {
if (i-bestInd >= early.stop.round) { if (i - bestInd >= early.stop.round) {
earlyStopflag <- TRUE earlyStopflag <- TRUE
cat('Stopping. Best iteration:',bestInd) cat('Stopping. Best iteration:',bestInd)
break break

View File

@ -79,7 +79,7 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
} }
if((is.null(data) & !is.null(label)) |(!is.null(data) & is.null(label))) { if((is.null(data) & !is.null(label)) | (!is.null(data) & is.null(label))) {
stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.") stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.")
} }
@ -110,7 +110,7 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split]) d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split])
apply(c & d, 2, . %>% target %>% sum) -> vec apply(c & d, 2, . %>% target %>% sum) -> vec
result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo:=NULL] result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo := NULL]
} }
} }
result result
@ -119,13 +119,13 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
treeDump <- function(feature_names, text, keepDetail){ treeDump <- function(feature_names, text, keepDetail){
if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature" if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"
result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo":= Missing == No ][Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(Gain, decreasing = T)] result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo" := Missing == No ][Feature != "Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequence = Frequence / sum(Frequence))][order(Gain, decreasing = T)]
result result
} }
linearDump <- function(feature_names, text){ linearDump <- function(feature_names, text){
which(text == "weight:") %>% {a <- .+1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .) which(text == "weight:") %>% {a <- . + 1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
} }
# Avoid error messages during CRAN check. # Avoid error messages during CRAN check.

View File

@ -86,7 +86,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
text <- readLines(filename_dump) %>% str_trim(side = "both") text <- readLines(filename_dump) %>% str_trim(side = "both")
} }
position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)
extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
@ -96,15 +96,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
allTrees <- data.table() allTrees <- data.table()
anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
for(i in 1:n_round){ for(i in 1:n_round){
tree <- text[(position[i]+1):(position[i+1]-1)] tree <- text[(position[i] + 1):(position[i + 1] - 1)]
# avoid tree made of a leaf only (no split) # avoid tree made of a leaf only (no split)
if(length(tree) <2) next if(length(tree) < 2) next
treeID <- i-1 treeID <- i - 1
notLeaf <- str_match(tree, "leaf") %>% is.na notLeaf <- str_match(tree, "leaf") %>% is.na
leaf <- notLeaf %>% not %>% tree[.] leaf <- notLeaf %>% not %>% tree[.]
@ -128,7 +128,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex)) qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID] dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree := treeID]
allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
} }

View File

@ -44,9 +44,9 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature] importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters)) clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
importance_matrix[,"Cluster":=clusters$cluster %>% as.character] importance_matrix[,"Cluster" := clusters$cluster %>% as.character]
plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() ) plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment()) + ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() )
return(plot) return(plot)
} }

View File

@ -151,14 +151,14 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
if (!is.null(params$objective) && !is.null(obj)) if (!is.null(params$objective) && !is.null(obj))
stop("xgb.train: cannot assign two different objectives") stop("xgb.train: cannot assign two different objectives")
if (!is.null(params$objective)) if (!is.null(params$objective))
if (class(params$objective)=='function') { if (class(params$objective) == 'function') {
obj <- params$objective obj <- params$objective
params$objective <- NULL params$objective <- NULL
} }
if (!is.null(params$eval_metric) && !is.null(feval)) if (!is.null(params$eval_metric) && !is.null(feval))
stop("xgb.train: cannot assign two different evaluation metrics") stop("xgb.train: cannot assign two different evaluation metrics")
if (!is.null(params$eval_metric)) if (!is.null(params$eval_metric))
if (class(params$eval_metric)=='function') { if (class(params$eval_metric) == 'function') {
feval <- params$eval_metric feval <- params$eval_metric
params$eval_metric <- NULL params$eval_metric <- NULL
} }
@ -188,7 +188,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
bestInd <- 0 bestInd <- 0
earlyStopflag <- FALSE earlyStopflag <- FALSE
if (length(watchlist)>1) if (length(watchlist) > 1)
warning('Only the first data set in watchlist is used for early stopping process.') warning('Only the first data set in watchlist is used for early stopping process.')
} }
@ -200,17 +200,17 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj)
if (length(watchlist) != 0) { if (length(watchlist) != 0) {
msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval)
if (0== ( (i-1) %% print.every.n)) if (0 == ( (i - 1) %% print.every.n))
cat(paste(msg, "\n", sep="")) cat(paste(msg, "\n", sep = ""))
if (!is.null(early.stop.round)) if (!is.null(early.stop.round))
{ {
score <- strsplit(msg,':|\\s+')[[1]][3] score <- strsplit(msg,':|\\s+')[[1]][3]
score <- as.numeric(score) score <- as.numeric(score)
if ((maximize && score>bestScore) || (!maximize && score<bestScore)) { if ((maximize && score > bestScore) || (!maximize && score < bestScore)) {
bestScore <- score bestScore <- score
bestInd <- i bestInd <- i
} else { } else {
if (i-bestInd>=early.stop.round) { if (i - bestInd >= early.stop.round) {
earlyStopflag <- TRUE earlyStopflag <- TRUE
cat('Stopping. Best iteration:',bestInd) cat('Stopping. Best iteration:',bestInd)
break break

View File

@ -18,7 +18,7 @@ test_that("early stopping", {
res <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5, res <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
early.stop.round = 3, maximize = FALSE) early.stop.round = 3, maximize = FALSE)
expect_true(nrow(res)<20) expect_true(nrow(res) < 20)
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
early.stop.round = 3, maximize = FALSE) early.stop.round = 3, maximize = FALSE)

View File

@ -13,14 +13,14 @@ test_that("custom objective works", {
logregobj <- function(preds, dtrain) { logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds)) preds <- 1 / (1 + exp(-preds))
grad <- preds - labels grad <- preds - labels
hess <- preds * (1 - preds) hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess)) return(list(grad = grad, hess = hess))
} }
evalerror <- function(preds, dtrain) { evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label") labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels) err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
return(list(metric = "error", value = err)) return(list(metric = "error", value = err))
} }
@ -34,13 +34,13 @@ test_that("custom objective works", {
logregobjattr <- function(preds, dtrain) { logregobjattr <- function(preds, dtrain) {
labels <- attr(dtrain, 'label') labels <- attr(dtrain, 'label')
preds <- 1/(1 + exp(-preds)) preds <- 1 / (1 + exp(-preds))
grad <- preds - labels grad <- preds - labels
hess <- preds * (1 - preds) hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess)) return(list(grad = grad, hess = hess))
} }
param <- list(max.depth=2, eta=1, nthread = 2, silent=1, param <- list(max.depth=2, eta=1, nthread = 2, silent = 1,
objective=logregobjattr, eval_metric=evalerror) objective = logregobjattr, eval_metric = evalerror)
bst <- xgb.train(param, dtrain, num_round, watchlist) bst <- xgb.train(param, dtrain, num_round, watchlist)
expect_equal(class(bst), "xgb.Booster") expect_equal(class(bst), "xgb.Booster")
expect_equal(length(bst$raw), 1064) expect_equal(length(bst$raw), 1064)

View File

@ -8,11 +8,11 @@ require(vcd)
data(Arthritis) data(Arthritis)
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
df <- data.table(Arthritis, keep.rownames = F) df <- data.table(Arthritis, keep.rownames = F)
df[,AgeDiscret:= as.factor(round(Age/10,0))] df[,AgeDiscret := as.factor(round(Age / 10,0))]
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))] df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
df[,ID:=NULL] df[,ID := NULL]
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) sparse_matrix <- sparse.model.matrix(Improved ~ . -1, data = df)
output_vector <- df[,Y:=0][Improved == "Marked",Y:=1][,Y] output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y]
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
eta = 1, nthread = 2, nround = 10,objective = "binary:logistic") eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")

View File

@ -23,5 +23,5 @@ test_that("Code Lint", {
trailing_blank_lines_linter=lintr::trailing_blank_lines_linter, trailing_blank_lines_linter=lintr::trailing_blank_lines_linter,
trailing_whitespace_linter=lintr::trailing_whitespace_linter trailing_whitespace_linter=lintr::trailing_whitespace_linter
) )
# lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality
}) })

View File

@ -4,10 +4,10 @@ require(xgboost)
test_that("poisson regression works", { test_that("poisson regression works", {
data(mtcars) data(mtcars)
bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11], bst <- xgboost(data = as.matrix(mtcars[,-11]),label = mtcars[,11],
objective='count:poisson',nrounds=5) objective = 'count:poisson', nrounds=5)
expect_equal(class(bst), "xgb.Booster") expect_equal(class(bst), "xgb.Booster")
pred = predict(bst,as.matrix(mtcars[,-11])) pred <- predict(bst,as.matrix(mtcars[, -11]))
expect_equal(length(pred), 32) expect_equal(length(pred), 32)
sqrt(mean((pred-mtcars[,11])^2)) sqrt(mean((pred - mtcars[,11]) ^ 2))
}) })