fix early stopping and prediction

parent 6b254ec495
commit 704d9e0a13
@@ -95,157 +95,156 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
                   prediction = FALSE, showsd = TRUE, metrics = list(),
                   obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, print.every.n = 1L,
                   early.stop.round = NULL, maximize = NULL, ...) {
  if (typeof(params) != "list") {
    stop("xgb.cv: first argument params must be list")
  }
  if (!is.null(folds)) {
    if (class(folds) != "list" | length(folds) < 2) {
      stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold")
    }
    nfold <- length(folds)
  }
  if (nfold <= 1) {
    stop("nfold must be bigger than 1")
  }
  if (is.null(missing)) {
    dtrain <- xgb.get.DMatrix(data, label)
  } else {
    dtrain <- xgb.get.DMatrix(data, label, missing)
  }
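  # merge any parameters passed through ... into params; a name supplied in both places is an error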
  dot.params <- list(...)
  nms.params <- names(params)
  nms.dot.params <- names(dot.params)
  if (length(intersect(nms.params, nms.dot.params)) > 0)
    stop("Duplicated defined term in parameters. Please check your list of params.")
  params <- append(params, dot.params)
  params <- append(params, list(silent = 1))
  for (mc in metrics) {
    params <- append(params, list("eval_metric" = mc))
  }

  # customized objective and evaluation metric interface
  if (!is.null(params$objective) && !is.null(obj))
    stop("xgb.cv: cannot assign two different objectives")
  if (!is.null(params$objective))
    if (class(params$objective) == 'function') {
      obj <- params$objective
      params[['objective']] <- NULL
    }
  # if (!is.null(params$eval_metric) && !is.null(feval))
  #   stop("xgb.cv: cannot assign two different evaluation metrics")
  if (!is.null(params$eval_metric))
    if (class(params$eval_metric) == 'function') {
      feval <- params$eval_metric
      params[['eval_metric']] <- NULL
    }

  # Early Stopping
  if (!is.null(early.stop.round)) {
    if (!is.null(feval) && is.null(maximize))
      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
    if (is.null(maximize) && is.null(params$eval_metric))
      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
    if (is.null(maximize)) {
      if (params$eval_metric %in% c('rmse', 'logloss', 'error', 'merror', 'mlogloss')) {
        maximize <- FALSE
      } else {
        maximize <- TRUE
      }
    }

    if (maximize) {
      bestScore <- 0
    } else {
      bestScore <- Inf
    }
    bestInd <- 0
    earlyStopflag <- FALSE

    if (length(metrics) > 1)
      warning('Only the first metric is used for early stopping process.')
  }
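
  # make the CV folds; each fold carries its own booster, a train/test watchlist
  # and the indices of its held-out rows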
  xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
  obj_type <- params[['objective']]
  mat_pred <- FALSE
  if (!is.null(obj_type) && obj_type == 'multi:softprob') {
    num_class <- params[['num_class']]
    if (is.null(num_class))
      stop('must set num_class to use softmax')
    predictValues <- matrix(0, xgb.numrow(dtrain), num_class)
    mat_pred <- TRUE
  } else {
    predictValues <- rep(0, xgb.numrow(dtrain))
  }
  history <- c()
  print.every.n <- max(as.integer(print.every.n), 1L)
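
  # main loop: advance every fold's booster one round, then aggregate the per-fold
  # evaluation logs; on the last round, also collect out-of-fold predictions when
  # prediction = TRUE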
  for (i in 1:nrounds) {
    msg <- list()
    for (k in 1:nfold) {
      fd <- xgb_folds[[k]]
      succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
      if (i < nrounds) {
        msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
      } else {
        if (!prediction) {
          msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
        } else {
          res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
          if (mat_pred) {
            pred_mat <- matrix(res[[2]], num_class, length(fd$index))
            predictValues[fd$index, ] <- t(pred_mat)
          } else {
            predictValues[fd$index] <- res[[2]]
          }
          msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
        }
      }
    }
    ret <- xgb.cv.aggcv(msg, showsd)
    history <- c(history, ret)
    if (verbose)
      if (0 == (i - 1L) %% print.every.n)
        cat(ret, "\n", sep = "")
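
    # the aggregated line looks like "[i]\ttrain-metric:mean+std\ttest-metric:mean+std";
    # pull the mean of the tracked test metric out of it for the early stopping comparison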
    # early stopping
    if (!is.null(early.stop.round)) {
      score <- strsplit(ret, '\\s+')[[1]][1 + length(metrics) + 2]
      score <- strsplit(score, '\\+|:')[[1]][[2]]
      score <- as.numeric(score)
      if ((maximize && score > bestScore) || (!maximize && score < bestScore)) {
        bestScore <- score
        bestInd <- i
      } else {
        if (i - bestInd >= early.stop.round) {
          earlyStopflag <- TRUE
          cat('Stopping. Best iteration:', bestInd)
          break
        }
      }
    }
  }
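
  # reshape the history strings into a data.table: one row per completed round,
  # one column per metric mean (and std when showsd = TRUE)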
  colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>%
    str_extract(".*:") %>% str_replace(":", "") %>% str_replace("-", ".")
  colnamesMean <- paste(colnames, "mean")
  if (showsd) colnamesStd <- paste(colnames, "std")

  colnames <- c()
  if (showsd) for (i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
  else colnames <- colnamesMean

  type <- rep(x = "numeric", times = length(colnames))
  dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
  split <- str_split(string = history, pattern = "\t")

  for (line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>%
    unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)}

  if (prediction) {
    return(list(dt = dt, pred = predictValues))
  }
  return(dt)
}
# Avoid error messages during CRAN check.
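For reference, a minimal sketch of how the two code paths this commit touches can be exercised; the dataset and parameter values below are illustrative assumptions, not part of the commit:

library(xgboost)
data(agaricus.train, package = 'xgboost')

# 5-fold CV with early stopping on the test metric plus out-of-fold predictions
res <- xgb.cv(params = list(objective = 'binary:logistic', eta = 0.3, max.depth = 2),
              data = agaricus.train$data, label = agaricus.train$label,
              nrounds = 50, nfold = 5, metrics = list('error'),
              early.stop.round = 3,   # stop after 3 rounds without test-error improvement
              prediction = TRUE)      # also return out-of-fold predictions

# res$dt   : one row per completed round with the mean (and std) of each metric
# res$pred : out-of-fold predictions, filled on the final round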