From d7fce99564221f942eafa241ba2b999ba4db0179 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 28 Oct 2015 22:22:51 -0400 Subject: [PATCH 1/4] Lint fix on consistent assignment --- R-package/R/predict.xgb.Booster.R | 2 +- R-package/R/utils.R | 2 +- R-package/R/xgb.cv.R | 18 +++++----- R-package/R/xgb.importance.R | 2 +- R-package/R/xgb.model.dt.tree.R | 2 +- R-package/R/xgb.train.R | 44 ++++++++++++------------- R-package/tests/testthat/test_basic.R | 18 +++++----- R-package/tests/testthat/test_helpers.R | 4 +-- 8 files changed, 46 insertions(+), 46 deletions(-) diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R index 9cc1867da..432581e76 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -48,7 +48,7 @@ setMethod("predict", signature = "xgb.Booster", stop("predict: ntreelimit must be equal to or greater than 1") } } - option = 0 + option <- 0 if (outputmargin) { option <- option + 1 } diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 459eb068e..2c7c74fc3 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -261,7 +261,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { ret <- list() for (k in 1:nfold) { dtest <- slice(dall, folds[[k]]) - didx = c() + didx <- c() for (i in 1:nfold) { if (i != k) { didx <- append(didx, folds[[i]]) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 3f1be704f..af79bde4e 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -124,15 +124,15 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = stop("xgb.cv: cannot assign two different objectives") if (!is.null(params$objective)) if (class(params$objective) == 'function') { - obj = params$objective - params[['objective']] = NULL + obj <- params$objective + params[['objective']] <- NULL } # if (!is.null(params$eval_metric) && !is.null(feval)) # stop("xgb.cv: cannot assign two different evaluation metrics") if (!is.null(params$eval_metric)) if (class(params$eval_metric)=='function') { - feval = params$eval_metric - params[['eval_metric']] = NULL + feval <- params$eval_metric + params[['eval_metric']] <- NULL } # Early Stopping @@ -144,9 +144,9 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = if (is.null(maximize)) { if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { - maximize = FALSE + maximize <- FALSE } else { - maximize = TRUE + maximize <- TRUE } } @@ -167,16 +167,16 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = mat_pred <- FALSE if (!is.null(obj_type) && obj_type == 'multi:softprob') { - num_class = params[['num_class']] + num_class <- params[['num_class']] if (is.null(num_class)) stop('must set num_class to use softmax') predictValues <- matrix(0,xgb.numrow(dtrain),num_class) - mat_pred = TRUE + mat_pred <- TRUE } else predictValues <- rep(0,xgb.numrow(dtrain)) history <- c() - print.every.n = max(as.integer(print.every.n), 1L) + print.every.n <- max(as.integer(print.every.n), 1L) for (i in 1:nrounds) { msg <- list() for (k in 1:nfold) { diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 0b0703587..14c5bbd44 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -125,7 +125,7 @@ treeDump <- function(feature_names, text, keepDetail){ } linearDump <- function(feature_names, text){ - which(text == "weight:") %>% {a=.+1;text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, 
Weight = .) + which(text == "weight:") %>% {a <- .+1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .) } # Avoid error messages during CRAN check. diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index cef988962..b0f5ee279 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -81,7 +81,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model } if(!is.null(model)){ - text = xgb.dump(model = model, with.stats = T) + text <- xgb.dump(model = model, with.stats = T) } else if(!is.null(filename_dump)){ text <- readLines(filename_dump) %>% str_trim(side = "both") } diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index b1d79d866..4bf1d36f6 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -140,27 +140,27 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), warning('watchlist is provided but verbose=0, no evaluation information will be printed') } - dot.params = list(...) - nms.params = names(params) - nms.dot.params = names(dot.params) + dot.params <- list(...) + nms.params <- names(params) + nms.dot.params <- names(dot.params) if (length(intersect(nms.params,nms.dot.params))>0) stop("Duplicated term in parameters. Please check your list of params.") - params = append(params, dot.params) + params <- append(params, dot.params) # customized objective and evaluation metric interface if (!is.null(params$objective) && !is.null(obj)) stop("xgb.train: cannot assign two different objectives") if (!is.null(params$objective)) if (class(params$objective)=='function') { - obj = params$objective - params$objective = NULL + obj <- params$objective + params$objective <- NULL } if (!is.null(params$eval_metric) && !is.null(feval)) stop("xgb.train: cannot assign two different evaluation metrics") if (!is.null(params$eval_metric)) if (class(params$eval_metric)=='function') { - feval = params$eval_metric - params$eval_metric = NULL + feval <- params$eval_metric + params$eval_metric <- NULL } # Early stopping @@ -174,19 +174,19 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), if (is.null(maximize)) { if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { - maximize = FALSE + maximize <- FALSE } else { - maximize = TRUE + maximize <- TRUE } } if (maximize) { - bestScore = 0 + bestScore <- 0 } else { - bestScore = Inf + bestScore <- Inf } - bestInd = 0 - earlyStopflag = FALSE + bestInd <- 0 + earlyStopflag <- FALSE if (length(watchlist)>1) warning('Only the first data set in watchlist is used for early stopping process.') @@ -195,7 +195,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), handle <- xgb.Booster(params, append(watchlist, dtrain)) bst <- xgb.handleToBooster(handle) - print.every.n=max( as.integer(print.every.n), 1L) + print.every.n <- max( as.integer(print.every.n), 1L) for (i in 1:nrounds) { succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) if (length(watchlist) != 0) { @@ -204,14 +204,14 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), cat(paste(msg, "\n", sep="")) if (!is.null(early.stop.round)) { - score = strsplit(msg,':|\\s+')[[1]][3] - score = as.numeric(score) + score <- strsplit(msg,':|\\s+')[[1]][3] + score <- as.numeric(score) if ((maximize && score>bestScore) || (!maximize && score=early.stop.round) { - earlyStopflag = TRUE + earlyStopflag <- TRUE cat('Stopping. 
Best iteration:',bestInd) break } @@ -226,8 +226,8 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), } bst <- xgb.Booster.check(bst) if (!is.null(early.stop.round)) { - bst$bestScore = bestScore - bst$bestInd = bestInd + bst$bestScore <- bestScore + bst$bestInd <- bestInd } return(bst) } diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 791f1246c..88bd905ca 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -4,30 +4,30 @@ context("basic functions") data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') -train = agaricus.train -test = agaricus.test +train <- agaricus.train +test <- agaricus.test test_that("train and predict", { - bst = xgboost(data = train$data, label = train$label, max.depth = 2, + bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic") - pred = predict(bst, test$data) + pred <- predict(bst, test$data) }) test_that("early stopping", { - res = xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5, + res <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5, eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", early.stop.round = 3, maximize = FALSE) expect_true(nrow(res)<20) - bst = xgboost(data = train$data, label = train$label, max.depth = 2, + bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", early.stop.round = 3, maximize = FALSE) - pred = predict(bst, test$data) + pred <- predict(bst, test$data) }) test_that("save_period", { - bst = xgboost(data = train$data, label = train$label, max.depth = 2, + bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", save_period = 10, save_name = "xgb.model") - pred = predict(bst, test$data) + pred <- predict(bst, test$data) }) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 4d80146e3..9cef61c49 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -11,8 +11,8 @@ df <- data.table(Arthritis, keep.rownames = F) df[,AgeDiscret:= as.factor(round(Age/10,0))] df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))] df[,ID:=NULL] -sparse_matrix = sparse.model.matrix(Improved~.-1, data = df) -output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y] +sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) +output_vector <- df[,Y:=0][Improved == "Marked",Y:=1][,Y] bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, eta = 1, nthread = 2, nround = 10,objective = "binary:logistic") From 8bae7159944d9fafbb5f02d933273ef3c77ca0ef Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 28 Oct 2015 23:04:45 -0400 Subject: [PATCH 2/4] Lint fix on infix operators --- R-package/R/slice.xgb.DMatrix.R | 4 ++-- R-package/R/utils.R | 10 +++++----- R-package/R/xgb.cv.R | 10 +++++----- R-package/R/xgb.importance.R | 8 ++++---- R-package/R/xgb.model.dt.tree.R | 12 ++++++------ R-package/R/xgb.plot.importance.R | 4 ++-- R-package/R/xgb.train.R | 14 +++++++------- R-package/tests/testthat/test_basic.R | 2 +- R-package/tests/testthat/test_custom_objective.R | 10 +++++----- R-package/tests/testthat/test_helpers.R | 10 +++++----- R-package/tests/testthat/test_lint.R | 2 +- 
R-package/tests/testthat/test_poisson_regression.R | 8 ++++---- 12 files changed, 47 insertions(+), 47 deletions(-) diff --git a/R-package/R/slice.xgb.DMatrix.R b/R-package/R/slice.xgb.DMatrix.R index d8ef8cb9c..4d9854a85 100644 --- a/R-package/R/slice.xgb.DMatrix.R +++ b/R-package/R/slice.xgb.DMatrix.R @@ -34,8 +34,8 @@ setMethod("slice", signature = "xgb.DMatrix", attr_list <- attributes(object) nr <- xgb.numrow(object) len <- sapply(attr_list,length) - ind <- which(len==nr) - if (length(ind)>0) { + ind <- which(len == nr) + if (length(ind) > 0) { nms <- names(attr_list)[ind] for (i in 1:length(ind)) { attr(ret,nms[i]) <- attr(object,nms[i])[idxset] diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 2c7c74fc3..fa2d6524c 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -15,14 +15,14 @@ xgb.setinfo <- function(dmat, name, info) { stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix") } if (name == "label") { - if (length(info)!=xgb.numrow(dmat)) + if (length(info) != xgb.numrow(dmat)) stop("The length of labels must equal to the number of rows in the input data") .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE = "xgboost") return(TRUE) } if (name == "weight") { - if (length(info)!=xgb.numrow(dmat)) + if (length(info) != xgb.numrow(dmat)) stop("The length of weights must equal to the number of rows in the input data") .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE = "xgboost") @@ -36,7 +36,7 @@ xgb.setinfo <- function(dmat, name, info) { return(TRUE) } if (name == "group") { - if (sum(info)!=xgb.numrow(dmat)) + if (sum(info) != xgb.numrow(dmat)) stop("The sum of groups must equal to the number of rows in the input data") .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), PACKAGE = "xgboost") @@ -251,7 +251,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { # make simple non-stratified folds kstep <- length(randidx) %/% nfold folds <- list() - for (i in 1:(nfold-1)) { + for (i in 1:(nfold - 1)) { folds[[i]] <- randidx[1:kstep] randidx <- setdiff(randidx, folds[[i]]) } @@ -310,7 +310,7 @@ xgb.createFolds <- function(y, k = 10) ## At most, we will use quantiles. 
If the sample ## is too small, we just do regular unstratified ## CV - cuts <- floor(length(y)/k) + cuts <- floor(length(y) / k) if(cuts < 2) cuts <- 2 if(cuts > 5) cuts <- 5 y <- cut(y, diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index af79bde4e..7122f2480 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -130,7 +130,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = # if (!is.null(params$eval_metric) && !is.null(feval)) # stop("xgb.cv: cannot assign two different evaluation metrics") if (!is.null(params$eval_metric)) - if (class(params$eval_metric)=='function') { + if (class(params$eval_metric) == 'function') { feval <- params$eval_metric params[['eval_metric']] <- NULL } @@ -158,7 +158,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = bestInd <- 0 earlyStopflag <- FALSE - if (length(metrics)>1) + if (length(metrics) > 1) warning('Only the first metric is used for early stopping process.') } @@ -187,19 +187,19 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = ret <- xgb.cv.aggcv(msg, showsd) history <- c(history, ret) if(verbose) - if (0 == (i-1L)%%print.every.n) + if (0 == (i - 1L) %% print.every.n) cat(ret, "\n", sep="") # early_Stopping if (!is.null(early.stop.round)){ - score <- strsplit(ret,'\\s+')[[1]][1+length(metrics)+2] + score <- strsplit(ret,'\\s+')[[1]][1 + length(metrics) + 2] score <- strsplit(score,'\\+|:')[[1]][[2]] score <- as.numeric(score) if ((maximize && score > bestScore) || (!maximize && score < bestScore)) { bestScore <- score bestInd <- i } else { - if (i-bestInd >= early.stop.round) { + if (i - bestInd >= early.stop.round) { earlyStopflag <- TRUE cat('Stopping. Best iteration:',bestInd) break diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 14c5bbd44..8800c4c22 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -79,7 +79,7 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") } - if((is.null(data) & !is.null(label)) |(!is.null(data) & is.null(label))) { + if((is.null(data) & !is.null(label)) | (!is.null(data) & is.null(label))) { stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.") } @@ -110,7 +110,7 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split]) apply(c & d, 2, . 
%>% target %>% sum) -> vec - result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo:=NULL] + result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo := NULL] } } result @@ -119,13 +119,13 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N treeDump <- function(feature_names, text, keepDetail){ if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature" - result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo":= Missing == No ][Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(Gain, decreasing = T)] + result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo" := Missing == No ][Feature != "Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequence = Frequence / sum(Frequence))][order(Gain, decreasing = T)] result } linearDump <- function(feature_names, text){ - which(text == "weight:") %>% {a <- .+1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .) + which(text == "weight:") %>% {a <- . + 1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .) } # Avoid error messages during CRAN check. diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index b0f5ee279..281806d16 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -86,7 +86,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model text <- readLines(filename_dump) %>% str_trim(side = "both") } - position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) + position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1) extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist @@ -96,15 +96,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model allTrees <- data.table() - anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" + anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" for(i in 1:n_round){ - tree <- text[(position[i]+1):(position[i+1]-1)] + tree <- text[(position[i] + 1):(position[i + 1] - 1)] # avoid tree made of a leaf only (no split) - if(length(tree) <2) next + if(length(tree) < 2) next - treeID <- i-1 + treeID <- i - 1 notLeaf <- str_match(tree, "leaf") %>% is.na leaf <- notLeaf %>% not %>% tree[.] 
@@ -128,7 +128,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex)) coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") - dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID] + dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree := treeID] allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index f126dfe46..d469005dd 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -44,9 +44,9 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1 importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature] clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters)) - importance_matrix[,"Cluster":=clusters$cluster %>% as.character] + importance_matrix[,"Cluster" := clusters$cluster %>% as.character] - plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() ) + plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment()) + ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() ) return(plot) } diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 4bf1d36f6..7bb7bbf87 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -151,14 +151,14 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), if (!is.null(params$objective) && !is.null(obj)) stop("xgb.train: cannot assign two different objectives") if (!is.null(params$objective)) - if (class(params$objective)=='function') { + if (class(params$objective) == 'function') { obj <- params$objective params$objective <- NULL } if (!is.null(params$eval_metric) && !is.null(feval)) stop("xgb.train: cannot assign two different evaluation metrics") if (!is.null(params$eval_metric)) - if (class(params$eval_metric)=='function') { + if (class(params$eval_metric) == 'function') { feval <- params$eval_metric params$eval_metric <- NULL } @@ -188,7 +188,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), bestInd <- 0 earlyStopflag <- FALSE - if (length(watchlist)>1) + if 
(length(watchlist) > 1) warning('Only the first data set in watchlist is used for early stopping process.') } @@ -200,17 +200,17 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) if (length(watchlist) != 0) { msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) - if (0== ( (i-1) %% print.every.n)) - cat(paste(msg, "\n", sep="")) + if (0 == ( (i - 1) %% print.every.n)) + cat(paste(msg, "\n", sep = "")) if (!is.null(early.stop.round)) { score <- strsplit(msg,':|\\s+')[[1]][3] score <- as.numeric(score) - if ((maximize && score>bestScore) || (!maximize && score bestScore) || (!maximize && score < bestScore)) { bestScore <- score bestInd <- i } else { - if (i-bestInd>=early.stop.round) { + if (i - bestInd >= early.stop.round) { earlyStopflag <- TRUE cat('Stopping. Best iteration:',bestInd) break diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 88bd905ca..2e4e54902 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -18,7 +18,7 @@ test_that("early stopping", { res <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5, eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", early.stop.round = 3, maximize = FALSE) - expect_true(nrow(res)<20) + expect_true(nrow(res) < 20) bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", early.stop.round = 3, maximize = FALSE) diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index 9fcbeca4d..6fd9c6d6d 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -13,14 +13,14 @@ test_that("custom objective works", { logregobj <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") - preds <- 1/(1 + exp(-preds)) + preds <- 1 / (1 + exp(-preds)) grad <- preds - labels hess <- preds * (1 - preds) return(list(grad = grad, hess = hess)) } evalerror <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") - err <- as.numeric(sum(labels != (preds > 0)))/length(labels) + err <- as.numeric(sum(labels != (preds > 0))) / length(labels) return(list(metric = "error", value = err)) } @@ -34,13 +34,13 @@ test_that("custom objective works", { logregobjattr <- function(preds, dtrain) { labels <- attr(dtrain, 'label') - preds <- 1/(1 + exp(-preds)) + preds <- 1 / (1 + exp(-preds)) grad <- preds - labels hess <- preds * (1 - preds) return(list(grad = grad, hess = hess)) } - param <- list(max.depth=2, eta=1, nthread = 2, silent=1, - objective=logregobjattr, eval_metric=evalerror) + param <- list(max.depth=2, eta=1, nthread = 2, silent = 1, + objective = logregobjattr, eval_metric = evalerror) bst <- xgb.train(param, dtrain, num_round, watchlist) expect_equal(class(bst), "xgb.Booster") expect_equal(length(bst$raw), 1064) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 9cef61c49..668c16c5d 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -8,11 +8,11 @@ require(vcd) data(Arthritis) data(agaricus.train, package='xgboost') df <- data.table(Arthritis, keep.rownames = F) -df[,AgeDiscret:= as.factor(round(Age/10,0))] -df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))] -df[,ID:=NULL] -sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) 
-output_vector <- df[,Y:=0][Improved == "Marked",Y:=1][,Y] +df[,AgeDiscret := as.factor(round(Age / 10,0))] +df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] +df[,ID := NULL] +sparse_matrix <- sparse.model.matrix(Improved ~ . -1, data = df) +output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y] bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, eta = 1, nthread = 2, nround = 10,objective = "binary:logistic") diff --git a/R-package/tests/testthat/test_lint.R b/R-package/tests/testthat/test_lint.R index 2f2a07d54..38d1b0ec0 100644 --- a/R-package/tests/testthat/test_lint.R +++ b/R-package/tests/testthat/test_lint.R @@ -23,5 +23,5 @@ test_that("Code Lint", { trailing_blank_lines_linter=lintr::trailing_blank_lines_linter, trailing_whitespace_linter=lintr::trailing_whitespace_linter ) - # lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality + lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality }) diff --git a/R-package/tests/testthat/test_poisson_regression.R b/R-package/tests/testthat/test_poisson_regression.R index 5d3d78e27..acf2916bc 100644 --- a/R-package/tests/testthat/test_poisson_regression.R +++ b/R-package/tests/testthat/test_poisson_regression.R @@ -4,10 +4,10 @@ require(xgboost) test_that("poisson regression works", { data(mtcars) - bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11], - objective='count:poisson',nrounds=5) + bst <- xgboost(data = as.matrix(mtcars[,-11]),label = mtcars[,11], + objective = 'count:poisson', nrounds=5) expect_equal(class(bst), "xgb.Booster") - pred = predict(bst,as.matrix(mtcars[,-11])) + pred <- predict(bst,as.matrix(mtcars[, -11])) expect_equal(length(pred), 32) - sqrt(mean((pred-mtcars[,11])^2)) + sqrt(mean((pred - mtcars[,11]) ^ 2)) }) \ No newline at end of file From 60244804006c6b986bb1e3460035e9543eb97a68 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 28 Oct 2015 23:24:17 -0400 Subject: [PATCH 3/4] Fixed most of the lint issues --- R-package/R/slice.xgb.DMatrix.R | 2 +- R-package/R/utils.R | 22 +++++----- R-package/R/xgb.cv.R | 33 +++++++------- R-package/R/xgb.importance.R | 10 ++--- R-package/R/xgb.model.dt.tree.R | 44 +++++++++---------- R-package/R/xgb.plot.importance.R | 12 ++--- R-package/R/xgb.plot.tree.R | 41 +++++++++-------- R-package/R/xgb.save.R | 2 +- R-package/R/xgb.train.R | 26 +++++------ R-package/R/xgboost.R | 16 +++---- .../tests/testthat/test_custom_objective.R | 14 +++--- R-package/tests/testthat/test_lint.R | 2 +- .../tests/testthat/test_poisson_regression.R | 2 +- 13 files changed, 107 insertions(+), 119 deletions(-) diff --git a/R-package/R/slice.xgb.DMatrix.R b/R-package/R/slice.xgb.DMatrix.R index 4d9854a85..3b025e1dd 100644 --- a/R-package/R/slice.xgb.DMatrix.R +++ b/R-package/R/slice.xgb.DMatrix.R @@ -30,7 +30,7 @@ setMethod("slice", signature = "xgb.DMatrix", } ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost") - + attr_list <- attributes(object) nr <- xgb.numrow(object) len <- sapply(attr_list,length) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index fa2d6524c..ac497a9f4 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -68,7 +68,7 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { if (typeof(modelfile) == "character") { .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost") } else if (typeof(modelfile) == "raw") { - .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE 
= "xgboost") + .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost") } else { stop("xgb.Booster: modelfile must be character or raw vector") } @@ -122,7 +122,7 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) { } else if (inClass == "xgb.DMatrix") { dtrain <- data } else if (inClass == "data.frame") { - stop("xgboost only support numerical matrix input, + stop("xgboost only support numerical matrix input, use 'data.frame' to transform the data.") } else { stop("xgboost: Invalid input of data") @@ -156,12 +156,10 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) { } if (is.null(obj)) { - .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, + .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, PACKAGE = "xgboost") } else { pred <- predict(booster, dtrain) - gpair <- obj(pred, dtrain) - succ <- xgb.iter.boost(booster, dtrain, gpair) } return(TRUE) } @@ -189,9 +187,9 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F } evnames <- append(evnames, names(w)) } - msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, + msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames, PACKAGE = "xgboost") - } else { + } else { msg <- paste("[", iter, "]", sep="") for (j in 1:length(watchlist)) { w <- watchlist[j] @@ -247,7 +245,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { if (length(unique(y)) <= 5) y <- factor(y) } folds <- xgb.createFolds(y, nfold) - } else { + } else { # make simple non-stratified folds kstep <- length(randidx) %/% nfold folds <- list() @@ -282,7 +280,7 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) { kv <- strsplit(header[i], ":")[[1]] ret <- paste(ret, "\t", kv[1], ":", sep="") stats <- c() - stats[1] <- as.numeric(kv[2]) + stats[1] <- as.numeric(kv[2]) for (j in 2:length(res)) { tkv <- strsplit(res[[j]][i], ":")[[1]] stats[j] <- as.numeric(tkv[2]) @@ -311,8 +309,8 @@ xgb.createFolds <- function(y, k = 10) ## is too small, we just do regular unstratified ## CV cuts <- floor(length(y) / k) - if(cuts < 2) cuts <- 2 - if(cuts > 5) cuts <- 5 + if (cuts < 2) cuts <- 2 + if (cuts > 5) cuts <- 5 y <- cut(y, unique(stats::quantile(y, probs = seq(0, 1, length = cuts))), include.lowest = TRUE) @@ -324,7 +322,7 @@ xgb.createFolds <- function(y, k = 10) y <- factor(as.character(y)) numInClass <- table(y) foldVector <- vector(mode = "integer", length(y)) - + ## For each class, balance the fold allocation as far ## as possible, then resample the remainder. ## The final assignment of folds is also randomized. 
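Note on the xgb.iter.update() hunk above: with the custom-objective branch reduced to just pred <- predict(booster, dtrain), a user-supplied objective no longer triggers a boosting step, which is presumably why the Travis build broke; PATCH 4/4 below restores the two deleted lines. A minimal sketch of the intended control flow, using only names that appear in this series (xgb.iter.boost is internal to the package and assumed from the surrounding hunks):

    xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
      if (is.null(obj)) {
        # built-in objective: one native boosting iteration
        .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
              PACKAGE = "xgboost")
      } else {
        # custom objective: compute the gradient/hessian pair in R,
        # then hand it to the booster for one update
        pred <- predict(booster, dtrain)
        gpair <- obj(pred, dtrain)
        succ <- xgb.iter.boost(booster, dtrain, gpair)
      }
      return(TRUE)
    }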
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 7122f2480..245900743 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -118,7 +118,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = for (mc in metrics) { params <- append(params, list("eval_metric"=mc)) } - + # customized objective and evaluation metric interface if (!is.null(params$objective) && !is.null(obj)) stop("xgb.cv: cannot assign two different objectives") @@ -134,7 +134,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = feval <- params$eval_metric params[['eval_metric']] <- NULL } - + # Early Stopping if (!is.null(early.stop.round)){ if (!is.null(feval) && is.null(maximize)) @@ -149,7 +149,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = maximize <- TRUE } } - + if (maximize) { bestScore <- 0 } else { @@ -157,11 +157,11 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = } bestInd <- 0 earlyStopflag <- FALSE - + if (length(metrics) > 1) warning('Only the first metric is used for early stopping process.') } - + xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds) obj_type <- params[['objective']] mat_pred <- FALSE @@ -181,7 +181,6 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = msg <- list() for (k in 1:nfold) { fd <- xgb_folds[[k]] - succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj) msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]] } ret <- xgb.cv.aggcv(msg, showsd) @@ -189,13 +188,13 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = if(verbose) if (0 == (i - 1L) %% print.every.n) cat(ret, "\n", sep="") - + # early_Stopping if (!is.null(early.stop.round)){ score <- strsplit(ret,'\\s+')[[1]][1 + length(metrics) + 2] score <- strsplit(score,'\\+|:')[[1]][[2]] score <- as.numeric(score) - if ((maximize && score > bestScore) || (!maximize && score < bestScore)) { + if ( (maximize && score > bestScore) || (!maximize && score < bestScore)) { bestScore <- score bestInd <- i } else { @@ -206,9 +205,8 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = } } } - } - + if (prediction) { for (k in 1:nfold) { fd <- xgb_folds[[k]] @@ -225,24 +223,23 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = } } } - - + colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") colnamesMean <- paste(colnames, "mean") if(showsd) colnamesStd <- paste(colnames, "std") - + colnames <- c() if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i]) else colnames <- colnamesMean - + type <- rep(x = "numeric", times = length(colnames)) dt <- utils::read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table split <- str_split(string = history, pattern = "\t") - - for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)} - + + for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist( list( dt, .), use.names = F, fill = F)} + if (prediction) { - return(list(dt = dt,pred = predictValues)) + return( list( dt = dt,pred = predictValues)) } 
return(dt) } diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 8800c4c22..d635c00be 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -66,8 +66,8 @@ #' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label) #' #' @export -xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ((x + label) == 2)){ - if (!class(feature_names) %in% c("character", "NULL")) { +xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){ + if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") } @@ -98,7 +98,7 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.") } else { result <- treeDump(feature_names, text = text, keepDetail = !is.null(data)) - + # Co-occurence computation if(!is.null(data) & !is.null(label) & nrow(result) > 0) { # Take care of missing column @@ -109,9 +109,9 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N # Apply split d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split]) apply(c & d, 2, . %>% target %>% sum) -> vec - + result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo := NULL] - } + } } result } diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 281806d16..882ac6c1f 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -57,7 +57,7 @@ #' @export xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){ - if (!class(feature_names) %in% c("character", "NULL")) { + if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") } if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) { @@ -97,15 +97,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model allTrees <- data.table() anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" - for(i in 1:n_round){ - + for (i in 1:n_round){ + tree <- text[(position[i] + 1):(position[i + 1] - 1)] - + # avoid tree made of a leaf only (no split) if(length(tree) < 2) next - + treeID <- i - 1 - + notLeaf <- str_match(tree, "leaf") %>% is.na leaf <- notLeaf %>% not %>% tree[.] branch <- notLeaf %>% tree[.] 
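The piped indexing in the hunks above and below is terse: in leaf <- notLeaf %>% not %>% tree[.], magrittr's not is an alias for !, and tree[.] subsets tree by the piped logical vector. A small illustration with made-up dump lines (the values are hypothetical; only the idiom matters):

    library(magrittr)

    tree <- c("0:[f1<0.5] yes=1,no=2", "1:leaf=0.4", "2:leaf=-0.4")
    notLeaf <- c(TRUE, FALSE, FALSE)  # mirrors str_match(tree, "leaf") %>% is.na

    branch <- notLeaf %>% tree[.]           # tree[notLeaf]  -> the split node
    leaf   <- notLeaf %>% not %>% tree[.]   # tree[!notLeaf] -> the two leaf lines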
@@ -129,37 +129,37 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree := treeID] - + allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } yes <- allTrees[!is.na(Yes), Yes] - - set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "Yes.Feature", + + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), + j = "Yes.Feature", value = allTrees[ID %in% yes, Feature]) - + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "Yes.Cover", + j = "Yes.Cover", value = allTrees[ID %in% yes, Cover]) - + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "Yes.Quality", + j = "Yes.Quality", value = allTrees[ID %in% yes, Quality]) no <- allTrees[!is.na(No), No] - + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "No.Feature", + j = "No.Feature", value = allTrees[ID %in% no, Feature]) - + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "No.Cover", + j = "No.Cover", value = allTrees[ID %in% no, Cover]) - - set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "No.Quality", + + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), + j = "No.Quality", value = allTrees[ID %in% no, Quality]) - + allTrees } diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index d469005dd..92399516d 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -30,7 +30,7 @@ #' #' @export xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){ - if (!"data.table" %in% class(importance_matrix)) { + if (!"data.table" %in% class(importance_matrix)) { stop("importance_matrix: Should be a data.table.") } if (!requireNamespace("ggplot2", quietly = TRUE)) { @@ -42,13 +42,13 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1 # To avoid issues in clustering when co-occurences are used importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature] - + clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters)) importance_matrix[,"Cluster" := clusters$cluster %>% as.character] - - plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment()) + ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() ) - - return(plot) + + plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width = 0.05), environment = environment()) + ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() ) + + 
return(plot) } # Avoid error messages during CRAN check. diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index edcd5f47f..5e359219a 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -54,40 +54,39 @@ #' #' @export #' -xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){ - +xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){ + if (!(class(CSSstyle) %in% c("character", "NULL") && length(CSSstyle) <= 1)) { stop("style: Has to be a character vector of size 1.") } - + if (!class(model) %in% c("xgb.Booster", "NULL")) { stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") } - + if (!requireNamespace("DiagrammeR", quietly = TRUE)) { stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE) } - + if(is.null(model)){ - allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree) + allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree) } else { - allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree) + allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree) } - - allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
<br>Cover: ", Cover, "<br>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
-  
-  allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
-  
-  
+  
+  allTrees[Feature != "Leaf" ,yesPath := paste(ID,"(", Feature, "<br>Cover: ", Cover, "<br>
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")] + + allTrees[Feature != "Leaf" ,noPath := paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")] + if(is.null(CSSstyle)){ - CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" - } - - yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") - - no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") - - path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";") + CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" + } + + yes <- allTrees[Feature != "Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") + + no <- allTrees[Feature != "Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") + + path <- allTrees[Feature != "Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";") DiagrammeR::mermaid(path, width, height) } diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index 2600b8cff..ad3cc8b12 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -29,4 +29,4 @@ xgb.save <- function(model, fname) { stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save xgb.DMatrix object.") return(FALSE) -} +} diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 7bb7bbf87..e5b2b5ae0 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -120,9 +120,9 @@ #' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist) #' @export #' -xgb.train <- function(params=list(), data, nrounds, watchlist = list(), +xgb.train <- function(params=list(), data, nrounds, watchlist = list(), obj = NULL, feval = NULL, verbose = 1, print.every.n=1L, - early.stop.round = NULL, maximize = NULL, + early.stop.round = NULL, maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) { dtrain <- data if (typeof(params) != "list") { @@ -139,14 +139,14 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), if (length(watchlist) != 0 && verbose == 0) { warning('watchlist is provided but verbose=0, no evaluation information will be printed') } - + dot.params <- list(...) nms.params <- names(params) nms.dot.params <- names(dot.params) - if (length(intersect(nms.params,nms.dot.params))>0) + if (length(intersect(nms.params,nms.dot.params)) > 0) stop("Duplicated term in parameters. 
Please check your list of params.") params <- append(params, dot.params) - + # customized objective and evaluation metric interface if (!is.null(params$objective) && !is.null(obj)) stop("xgb.train: cannot assign two different objectives") @@ -162,7 +162,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), feval <- params$eval_metric params$eval_metric <- NULL } - + # Early stopping if (!is.null(early.stop.round)){ if (!is.null(feval) && is.null(maximize)) @@ -179,25 +179,22 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), maximize <- TRUE } } - + if (maximize) { bestScore <- 0 } else { bestScore <- Inf } bestInd <- 0 - earlyStopflag <- FALSE - + if (length(watchlist) > 1) warning('Only the first data set in watchlist is used for early stopping process.') } - - + handle <- xgb.Booster(params, append(watchlist, dtrain)) bst <- xgb.handleToBooster(handle) print.every.n <- max( as.integer(print.every.n), 1L) for (i in 1:nrounds) { - succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) if (length(watchlist) != 0) { msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) if (0 == ( (i - 1) %% print.every.n)) @@ -206,12 +203,11 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), { score <- strsplit(msg,':|\\s+')[[1]][3] score <- as.numeric(score) - if ((maximize && score > bestScore) || (!maximize && score < bestScore)) { + if ( (maximize && score > bestScore) || (!maximize && score < bestScore)) { bestScore <- score bestInd <- i } else { if (i - bestInd >= early.stop.round) { - earlyStopflag <- TRUE cat('Stopping. Best iteration:',bestInd) break } @@ -230,4 +226,4 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), bst$bestInd <- bestInd } return(bst) -} +} diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index e11052add..122d2f492 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -59,28 +59,26 @@ #' #' @export #' -xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL, - params = list(), nrounds, +xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL, + params = list(), nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) 
{ dtrain <- xgb.get.DMatrix(data, label, missing, weight) - + params <- append(params, list(...)) - + if (verbose > 0) { watchlist <- list(train = dtrain) } else { watchlist <- list() } - + bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n, early.stop.round = early.stop.round, maximize = maximize, save_period = save_period, save_name = save_name) - + return(bst) -} - - +} #' Training part from Mushroom Data Set #' #' This data set is originally from the Mushroom data set, diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index 6fd9c6d6d..3db595f49 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -7,10 +7,10 @@ test_that("custom objective works", { data(agaricus.test, package='xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) - + watchlist <- list(eval = dtest, train = dtrain) num_round <- 2 - + logregobj <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") preds <- 1 / (1 + exp(-preds)) @@ -23,15 +23,15 @@ test_that("custom objective works", { err <- as.numeric(sum(labels != (preds > 0))) / length(labels) return(list(metric = "error", value = err)) } - - param <- list(max.depth=2, eta=1, nthread = 2, silent=1, + + param <- list(max.depth=2, eta=1, nthread = 2, silent=1, objective=logregobj, eval_metric=evalerror) - + bst <- xgb.train(param, dtrain, num_round, watchlist) expect_equal(class(bst), "xgb.Booster") expect_equal(length(bst$raw), 1064) attr(dtrain, 'label') <- getinfo(dtrain, 'label') - + logregobjattr <- function(preds, dtrain) { labels <- attr(dtrain, 'label') preds <- 1 / (1 + exp(-preds)) @@ -39,7 +39,7 @@ test_that("custom objective works", { hess <- preds * (1 - preds) return(list(grad = grad, hess = hess)) } - param <- list(max.depth=2, eta=1, nthread = 2, silent = 1, + param <- list(max.depth=2, eta=1, nthread = 2, silent = 1, objective = logregobjattr, eval_metric = evalerror) bst <- xgb.train(param, dtrain, num_round, watchlist) expect_equal(class(bst), "xgb.Booster") diff --git a/R-package/tests/testthat/test_lint.R b/R-package/tests/testthat/test_lint.R index 38d1b0ec0..2f2a07d54 100644 --- a/R-package/tests/testthat/test_lint.R +++ b/R-package/tests/testthat/test_lint.R @@ -23,5 +23,5 @@ test_that("Code Lint", { trailing_blank_lines_linter=lintr::trailing_blank_lines_linter, trailing_whitespace_linter=lintr::trailing_whitespace_linter ) - lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality + # lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality }) diff --git a/R-package/tests/testthat/test_poisson_regression.R b/R-package/tests/testthat/test_poisson_regression.R index acf2916bc..c28820774 100644 --- a/R-package/tests/testthat/test_poisson_regression.R +++ b/R-package/tests/testthat/test_poisson_regression.R @@ -9,5 +9,5 @@ test_that("poisson regression works", { expect_equal(class(bst), "xgb.Booster") pred <- predict(bst,as.matrix(mtcars[, -11])) expect_equal(length(pred), 32) - sqrt(mean((pred - mtcars[,11]) ^ 2)) + sqrt(mean( (pred - mtcars[,11]) ^ 2)) }) \ No newline at end of file From 5b9e071c183cc3c79dff2379a38625b3894b05d4 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 28 Oct 2015 23:49:18 -0400 Subject: [PATCH 4/4] Fix travis build (+1 squashed commit) Squashed commits: 
[9240d5f] Fix Travis build --- R-package/R/utils.R | 4 +++- R-package/R/xgb.cv.R | 1 + R-package/R/xgb.importance.R | 2 +- R-package/R/xgb.train.R | 3 +++ R-package/tests/testthat/test_helpers.R | 2 +- 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index ac497a9f4..b4f4a371f 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -1,4 +1,4 @@ -#' @importClassesFrom Matrix dgCMatrix dgeMatrix + #' @importClassesFrom Matrix dgCMatrix dgeMatrix #' @import methods # depends on matrix @@ -160,6 +160,8 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) { PACKAGE = "xgboost") } else { pred <- predict(booster, dtrain) + gpair <- obj(pred, dtrain) + succ <- xgb.iter.boost(booster, dtrain, gpair) } return(TRUE) } diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 245900743..5f964c4f8 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -181,6 +181,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = msg <- list() for (k in 1:nfold) { fd <- xgb_folds[[k]] + succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj) msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]] } ret <- xgb.cv.aggcv(msg, showsd) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index d635c00be..478438a79 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -125,7 +125,7 @@ treeDump <- function(feature_names, text, keepDetail){ } linearDump <- function(feature_names, text){ - which(text == "weight:") %>% {a <- . + 1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .) + which(text == "weight:") %>% {a =. + 1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .) } # Avoid error messages during CRAN check. diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index e5b2b5ae0..8e839af5c 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -186,6 +186,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), bestScore <- Inf } bestInd <- 0 + earlyStopflag = FALSE if (length(watchlist) > 1) warning('Only the first data set in watchlist is used for early stopping process.') @@ -195,6 +196,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), bst <- xgb.handleToBooster(handle) print.every.n <- max( as.integer(print.every.n), 1L) for (i in 1:nrounds) { + succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) if (length(watchlist) != 0) { msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) if (0 == ( (i - 1) %% print.every.n)) @@ -207,6 +209,7 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), bestScore <- score bestInd <- i } else { + earlyStopflag = TRUE if (i - bestInd >= early.stop.round) { cat('Stopping. Best iteration:',bestInd) break diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 668c16c5d..0ac6b388e 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -11,7 +11,7 @@ df <- data.table(Arthritis, keep.rownames = F) df[,AgeDiscret := as.factor(round(Age / 10,0))] df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] df[,ID := NULL] -sparse_matrix <- sparse.model.matrix(Improved ~ . 
-1, data = df) +sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y] bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
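Taken together, the series moves the R package onto the style enforced in test_lint.R: consistent <- assignment (PATCH 1/4), spaces around infix operators (PATCH 2/4), the remaining linters (PATCH 3/4), and a revert of the over-eager deletions that broke the build (PATCH 4/4). A minimal sketch of checking the first two conventions locally, assuming lintr >= 3 (the 2015-era API used in test_lint.R passed linters without parentheses):

    library(lintr)

    f <- tempfile(fileext = ".R")
    writeLines(c(
      "x=1",      # '=' used for assignment       -> assignment_linter
      "y<-x+2"    # no spaces around '<-' and '+' -> infix_spaces_linter
    ), f)

    my_linters <- list(assignment_linter(), infix_spaces_linter())
    lint(f, linters = my_linters)   # flags both lines

    writeLines(c("x <- 1", "y <- x + 2"), f)
    lint(f, linters = my_linters)   # clean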