xgboost/R-package/R/xgb.DMatrix.R

#' Construct xgb.DMatrix object
#'
#' Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
#' such as [xgb.train()] or [predict()].
#'
#' Function `xgb.QuantileDMatrix()` will construct a DMatrix with quantization for the histogram
#' method already applied to it, which can be used to reduce memory usage (compared to using a
#' a regular DMatrix first and then creating a quantization out of it) when using the histogram
#' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the
#' sorted-indices method (`tree_method = "exact"`), nor for the approximate method
#' (`tree_method = "approx"`).
#' @param data Data from which to create a DMatrix, which can then be used for fitting models or
#' for getting predictions out of a fitted model.
#'
#' Supported input types are as follows:\itemize{
#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
#'
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
#' encoding') will be converted inside the function call. Be aware that the encoding used for `factor`
#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
#' responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix
#' was constructed.
#'
#' Other column types are not supported.
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are **not** supported for
#' 'xgb.QuantileDMatrix'.
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
#' as a single row (only when making predictions from a fitted model).
#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
#' the file, with an optional format specifier.
#'
#' These are **not** supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
#'   \item XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()].
#'   \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
#'     `?format=libsvm` at the end of the file path. It will be the default format if not
#'     otherwise specified.
#'   \item CSV files (comma-separated values). This format can be specified by adding suffix
#'     `?format=csv` at the end ofthe file path. It will **not** be auto-deduced from file extensions.
#'   }
#'
#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
#' it will not look at the extension or file contents to determine that it is a comma-separated value.
#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
#' corresponds to the labels).
#'
#' For more information about passing text files as input, see the articles
#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
#' }
#' @param label Label of the training data. For classification problems, should be passed encoded as
#' integers with numeration starting at zero.
#' @param weight Weight for each instance.
#'
#'   Note that, for ranking task, weights are per-group.  In ranking task, one weight
#'   is assigned to each group (not each data point). This is because we
#'   only care about the relative ordering of data points within each group,
#'   so it doesn't make sense to assign weights to individual data points.
#' @param base_margin Base margin used for boosting from existing model.
#'
#'   In the case of multi-output models, one can also pass multi-dimensional base_margin.
#' @param missing A float value to represents missing values in data (not used when creating DMatrix
#'   from text files). It is useful to change when a zero, infinite, or some other
#'   extreme value represents missing values in data.
#' @param silent whether to suppress printing an informational message after loading from a file.
#' @param feature_names Set names for features. Overrides column names in data frame and matrix.
#'
#'   Note: columns are not referenced by name when calling `predict`, so the column order there
#'   must be the same as in the DMatrix construction, regardless of the column names.
#' @param feature_types Set types for features.
#'
#'   If `data` is a `data.frame` and passing `feature_types` is not supplied,
#'   feature types will be deduced automatically from the column types.
#'
#'   Otherwise, one can pass a character vector with the same length as number of columns in `data`,
#'   with the following possible values:
#'   - "c", which represents categorical columns.
#'   - "q", which represents numeric columns.
#'   - "int", which represents integer columns.
#'   - "i", which represents logical (boolean) columns.
#'
#'   Note that, while categorical types are treated differently from the rest for model fitting
#'   purposes, the other types do not influence the generated model, but have effects in other
#'   functionalities such as feature importances.
#'
#'   **Important**: Categorical features, if specified manually through `feature_types`, must
#'   be encoded as integers with numeration starting at zero, and the same encoding needs to be
#'   applied when passing data to [predict()]. Even if passing `factor` types, the encoding will
#'   not be saved, so make sure that `factor` columns passed to `predict` have the same `levels`.
#' @param nthread Number of threads used for creating DMatrix.
#' @param group Group size for all ranking group.
#' @param qid Query ID for data samples, used for ranking.
#' @param label_lower_bound Lower bound for survival training.
#' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling.
#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
#'   whether to split by row or column. Allowed values are `"row"` and `"col"`.
#'
#'   In distributed mode, the file is split accordingly; otherwise this is only an indicator on
#'   how the file was split beforehand. Default to row.
#'
#'   This is not used when `data` is not a URI.
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
#' subclass 'xgb.QuantileDMatrix'.
#'
#' @details
#' Note that DMatrix objects are not serializable through R functions such as [saveRDS()] or [save()].
#' If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
#' chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
#' from the original source of data.
#'
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
#' ## Keep the number of threads to 1 for examples
#' nthread <- 1
#' data.table::setDTthreads(nthread)
#' dtrain <- with(
#'   agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
#' )
#' fname <- file.path(tempdir(), "xgb.DMatrix.data")
#' xgb.DMatrix.save(dtrain, fname)
#' dtrain <- xgb.DMatrix(fname)
#' @export
#' @rdname xgb.DMatrix
xgb.DMatrix <- function(
  data,
  label = NULL,
  weight = NULL,
  base_margin = NULL,
  missing = NA,
  silent = FALSE,
  feature_names = colnames(data),
  feature_types = NULL,
  nthread = NULL,
  group = NULL,
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL,
  data_split_mode = "row"
) {
  if (!is.null(group) && !is.null(qid)) {
    stop("Either one of 'group' or 'qid' should be NULL")
  }
  nthread <- as.integer(NVL(nthread, -1L))
  if (typeof(data) == "character") {
    if (length(data) > 1) {
      stop(
        "'data' has class 'character' and length ", length(data),
        ".\n  'data' accepts either a numeric matrix or a single filename."
      )
    }
    data <- path.expand(data)
    if (data_split_mode == "row") {
      data_split_mode <- 0L
    } else if (data_split_mode == "col") {
      data_split_mode <- 1L
    } else {
      stop("Passed invalid 'data_split_mode': ", data_split_mode)
    }
    handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode)
  } else if (is.matrix(data)) {
    handle <- .Call(
      XGDMatrixCreateFromMat_R, data, missing, nthread
    )
  } else if (inherits(data, "dgCMatrix")) {
    handle <- .Call(
      XGDMatrixCreateFromCSC_R,
      data@p,
      data@i,
      data@x,
      nrow(data),
      missing,
      nthread
    )
  } else if (inherits(data, "dgRMatrix")) {
    handle <- .Call(
      XGDMatrixCreateFromCSR_R,
      data@p,
      data@j,
      data@x,
      ncol(data),
      missing,
      nthread
    )
  } else if (inherits(data, "dsparseVector")) {
    indptr <- c(0L, as.integer(length(data@i)))
    ind <- as.integer(data@i) - 1L
    handle <- .Call(
      XGDMatrixCreateFromCSR_R,
      indptr,
      ind,
      data@x,
      length(data),
      missing,
      nthread
    )
  } else if (is.data.frame(data)) {
    tmp <- .process.df.for.dmatrix(data, feature_types)
    feature_types <- tmp$feature_types
    handle <- .Call(
      XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
    )
    rm(tmp)
  } else {
    stop("xgb.DMatrix does not support construction from ", typeof(data))
  }

  dmat <- handle
  attributes(dmat) <- list(
    class = "xgb.DMatrix",
    fields = new.env()
  )
  .set.dmatrix.fields(
    dmat = dmat,
    label = label,
    weight = weight,
    base_margin = base_margin,
    feature_names = feature_names,
    feature_types = feature_types,
    group = group,
    qid = qid,
    label_lower_bound = label_lower_bound,
    label_upper_bound = label_upper_bound,
    feature_weights = feature_weights
  )

  return(dmat)
}

.process.df.for.dmatrix <- function(df, feature_types) {
  if (!nrow(df) || !ncol(df)) {
    stop("'data' is an empty data.frame.")
  }
  if (!is.null(feature_types)) {
    if (!is.character(feature_types) || length(feature_types) != ncol(df)) {
      stop(
        "'feature_types' must be a character vector with one entry per column in 'data'."
      )
    }
  } else {
    feature_types <- sapply(df, function(col) {
      if (is.factor(col)) {
        return("c")
      } else if (is.integer(col)) {
        return("int")
      } else if (is.logical(col)) {
        return("i")
      } else {
        if (!is.numeric(col)) {
          stop("Invalid type in dataframe.")
        }
        return("float")
      }
    })
  }

  lst <- lapply(df, function(col) {
    is_factor <- is.factor(col)
    col <- as.numeric(col)
    if (is_factor) {
      col <- col - 1
    }
    return(col)
  })

  return(list(lst = lst, feature_types = feature_types))
}

.set.dmatrix.fields <- function(
  dmat,
  label,
  weight,
  base_margin,
  feature_names,
  feature_types,
  group,
  qid,
  label_lower_bound,
  label_upper_bound,
  feature_weights
) {
  if (!is.null(label)) {
    setinfo(dmat, "label", label)
  }
  if (!is.null(weight)) {
    setinfo(dmat, "weight", weight)
  }
  if (!is.null(base_margin)) {
    setinfo(dmat, "base_margin", base_margin)
  }
  if (!is.null(feature_names)) {
    setinfo(dmat, "feature_name", feature_names)
  }
  if (!is.null(feature_types)) {
    setinfo(dmat, "feature_type", feature_types)
  }
  if (!is.null(group)) {
    setinfo(dmat, "group", group)
  }
  if (!is.null(qid)) {
    setinfo(dmat, "qid", qid)
  }
  if (!is.null(label_lower_bound)) {
    setinfo(dmat, "label_lower_bound", label_lower_bound)
  }
  if (!is.null(label_upper_bound)) {
    setinfo(dmat, "label_upper_bound", label_upper_bound)
  }
  if (!is.null(feature_weights)) {
    setinfo(dmat, "feature_weights", feature_weights)
  }
}

#' @param ref The training dataset that provides quantile information, needed when creating
#' validation/test dataset with [xgb.QuantileDMatrix()]. Supplying the training DMatrix
#' as a reference means that the same quantisation applied to the training data is
#' applied to the validation/test data
#' @param max_bin The number of histogram bin, should be consistent with the training parameter
#'   `max_bin`.
#'
#'   This is only supported when constructing a QuantileDMatrix.
#' @export
#' @rdname xgb.DMatrix
xgb.QuantileDMatrix <- function(
  data,
  label = NULL,
  weight = NULL,
  base_margin = NULL,
  missing = NA,
  feature_names = colnames(data),
  feature_types = NULL,
  nthread = NULL,
  group = NULL,
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL,
  ref = NULL,
  max_bin = NULL
) {
  nthread <- as.integer(NVL(nthread, -1L))
  if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) {
    stop("'ref' must be an xgb.DMatrix object.")
  }

  # Note: when passing an integer matrix, it won't get casted to numeric.
  # Since 'int' values as understood by languages like C cannot have missing values,
  # R represents missingness there by assigning them a value equal to the minimum
  # integer. The 'missing' value here is set before the data, so in case of integers,
  # need to make the conversion manually beforehand.
  if (is.matrix(data) && storage.mode(data) %in% c("integer", "logical") && is.na(missing)) {
    missing <- .Call(XGGetRNAIntAsDouble)
  }

  iterator_env <- as.environment(
    list(
      data = data,
      label = label,
      weight = weight,
      base_margin = base_margin,
      missing = missing,
      feature_names = feature_names,
      feature_types = feature_types,
      group = group,
      qid = qid,
      label_lower_bound = label_lower_bound,
      label_upper_bound = label_upper_bound,
      feature_weights = feature_weights
    )
  )
  data_iterator <- .single.data.iterator(iterator_env)

  # Note: the ProxyDMatrix has its finalizer assigned in the R externalptr
  # object, but that finalizer will only be called once the object is
  # garbage-collected, which doesn't happen immediately after it goes out
  # of scope, hence this piece of code to tigger its destruction earlier
  # and free memory right away.
  proxy_handle <- .make.proxy.handle()
  on.exit({
    .Call(XGDMatrixFree_R, proxy_handle)
  })
  iterator_next <- function() {
    return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
  }
  iterator_reset <- function() {
    return(data_iterator$f_reset(iterator_env))
  }
  calling_env <- environment()

  dmat <- .Call(
    XGQuantileDMatrixCreateFromCallback_R,
    iterator_next,
    iterator_reset,
    calling_env,
    proxy_handle,
    nthread,
    missing,
    max_bin,
    ref
  )
  attributes(dmat) <- list(
    class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
    fields = attributes(proxy_handle)$fields
  )
  return(dmat)
}

#' XGBoost Data Iterator
#'
#' @description
#' Interface to create a custom data iterator in order to construct a DMatrix
#' from external memory.
#'
#' This function is responsible for generating an R object structure containing callback
#' functions and an environment shared with them.
#'
#' The output structure from this function is then meant to be passed to [xgb.ExtMemDMatrix()],
#' which will consume the data and create a DMatrix from it by executing the callback functions.
#'
#' For more information, and for a usage example, see the documentation for [xgb.ExtMemDMatrix()].
#'
#' @param env An R environment to pass to the callback functions supplied here, which can be
#'   used to keep track of variables to determine how to handle the batches.
#'
#'   For example, one might want to keep track of an iteration number in this environment in order
#'   to know which part of the data to pass next.
#' @param f_next `function(env)` which is responsible for:
#'   - Accessing or retrieving the next batch of data in the iterator.
#'   - Supplying this data by calling function [xgb.DataBatch()] on it and returning the result.
#'   - Keeping track of where in the iterator batch it is or will go next, which can for example
#'     be done by modifiying variables in the `env` variable that is passed here.
#'   - Signaling whether there are more batches to be consumed or not, by returning `NULL`
#'     when the stream of data ends (all batches in the iterator have been consumed), or the result from
#'     calling [xgb.DataBatch()] when there are more batches in the line to be consumed.
#' @param f_reset `function(env)` which is responsible for reseting the data iterator
#'   (i.e. taking it back to the first batch, called before and after the sequence of batches
#'   has been consumed).
#'
#'   Note that, after resetting the iterator, the batches will be accessed again, so the same data
#'   (and in the same order) must be passed in subsequent iterations.
#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
#'   be passed to [xgb.ExtMemDMatrix()].
#' @seealso [xgb.ExtMemDMatrix()], [xgb.DataBatch()].
#' @export
xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
  if (!is.function(f_next)) {
    stop("'f_next' must be a function.")
  }
  if (!is.function(f_reset)) {
    stop("'f_reset' must be a function.")
  }
  out <- list(
    env = env,
    f_next = f_next,
    f_reset = f_reset
  )
  class(out) <- "xgb.DataIter"
  return(out)
}

.qdm.single.fnext <- function(env) {
  curr_iter <- env[["iter"]]
  if (curr_iter >= 1L) {
    return(NULL)
  }

  on.exit({
    env[["iter"]] <- curr_iter + 1L
  })
  return(
    xgb.DataBatch(
      data = env[["data"]],
      label = env[["label"]],
      weight = env[["weight"]],
      base_margin = env[["base_margin"]],
      feature_names = env[["feature_names"]],
      feature_types = env[["feature_types"]],
      group = env[["group"]],
      qid = env[["qid"]],
      label_lower_bound = env[["label_lower_bound"]],
      label_upper_bound = env[["label_upper_bound"]],
      feature_weights = env[["feature_weights"]]
    )
  )
}

.qdm.single.freset <- function(env) {
  env[["iter"]] <- 0L
  return(invisible(NULL))
}

.single.data.iterator <- function(env) {
  env[["iter"]] <- 0L
  return(xgb.DataIter(env, .qdm.single.fnext, .qdm.single.freset))
}

# Only for internal usage
.make.proxy.handle <- function() {
  out <- .Call(XGProxyDMatrixCreate_R)
  attributes(out) <- list(
    class = c("xgb.DMatrix", "xgb.ProxyDMatrix"),
    fields = new.env()
  )
  return(out)
}

#' Structure for Data Batches
#'
#' @description
#' Helper function to supply data in batches of a data iterator when
#' constructing a DMatrix from external memory through [xgb.ExtMemDMatrix()]
#' or through [xgb.QuantileDMatrix.from_iterator()].
#'
#' This function is **only** meant to be called inside of a callback function (which
#' is passed as argument to function [xgb.DataIter()] to construct a data iterator)
#' when constructing a DMatrix through external memory - otherwise, one should call
#' [xgb.DMatrix()] or [xgb.QuantileDMatrix()].
#'
#' The object that results from calling this function directly is **not** like
#' an `xgb.DMatrix` - i.e. cannot be used to train a model, nor to get predictions - only
#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
#'
#' For more information and for example usage, see the documentation for [xgb.ExtMemDMatrix()].
#' @inheritParams xgb.DMatrix
#' @param data Batch of data belonging to this batch.
#'
#'   Note that not all of the input types supported by [xgb.DMatrix()] are possible
#'   to pass here. Supported types are:
#'   - `matrix`, with types `numeric`, `integer`, and `logical`. Note that for types
#'     `integer` and `logical`, missing values might not be automatically recognized as
#'     as such - see the documentation for parameter `missing` in [xgb.ExtMemDMatrix()]
#'     for details on this.
#'   - `data.frame`, with the same types as supported by 'xgb.DMatrix' and same
#'     conversions applied to it. See the documentation for parameter `data` in
#'     [xgb.DMatrix()] for details on it.
#'   - CSR matrices, as class `dgRMatrix` from package "Matrix".
#' @return An object of class `xgb.DataBatch`, which is just a list containing the
#'   data and parameters passed here. It does **not** inherit from `xgb.DMatrix`.
#' @seealso [xgb.DataIter()], [xgb.ExtMemDMatrix()].
#' @export
xgb.DataBatch <- function(
  data,
  label = NULL,
  weight = NULL,
  base_margin = NULL,
  feature_names = colnames(data),
  feature_types = NULL,
  group = NULL,
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL
) {
  stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
  out <- list(
    data = data,
    label = label,
    weight = weight,
    base_margin = base_margin,
    feature_names = feature_names,
    feature_types = feature_types,
    group = group,
    qid = qid,
    label_lower_bound = label_lower_bound,
    label_upper_bound = label_upper_bound,
    feature_weights = feature_weights
  )
  class(out) <- "xgb.DataBatch"
  return(out)
}

# This is only for internal usage, class is not exposed to the user.
xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
  lst <- data_iterator$f_next(data_iterator$env)
  if (is.null(lst)) {
    return(0L)
  }
  if (!inherits(lst, "xgb.DataBatch")) {
    stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.DataBatch'.")
  }

  if (!is.null(lst$group) && !is.null(lst$qid)) {
    stop("Either one of 'group' or 'qid' should be NULL")
  }
  if (is.data.frame(lst$data)) {
    tmp <- .process.df.for.dmatrix(lst$data, lst$feature_types)
    lst$feature_types <- tmp$feature_types
    .Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst)
    rm(tmp)
  } else if (is.matrix(lst$data)) {
    .Call(XGProxyDMatrixSetDataDense_R, proxy_handle, lst$data)
  } else if (inherits(lst$data, "dgRMatrix")) {
    tmp <- list(p = lst$data@p, j = lst$data@j, x = lst$data@x, ncol = ncol(lst$data))
    .Call(XGProxyDMatrixSetDataCSR_R, proxy_handle, tmp)
  } else {
    stop("'data' has unsupported type.")
  }

  .set.dmatrix.fields(
    dmat = proxy_handle,
    label = lst$label,
    weight = lst$weight,
    base_margin = lst$base_margin,
    feature_names = lst$feature_names,
    feature_types = lst$feature_types,
    group = lst$group,
    qid = lst$qid,
    label_lower_bound = lst$label_lower_bound,
    label_upper_bound = lst$label_upper_bound,
    feature_weights = lst$feature_weights
  )

  return(1L)
}

#' DMatrix from External Data
#'
#' @description
#' Create a special type of XGBoost 'DMatrix' object from external data
#' supplied by an [xgb.DataIter()] object, potentially passed in batches from a
#' bigger set that might not fit entirely in memory.
#'
#' The data supplied by the iterator is accessed on-demand as needed, multiple times,
#' without being concatenated, but note that fields like 'label' **will** be
#' concatenated from multiple calls to the data iterator.
#'
#' For more information, see the guide 'Using XGBoost External Memory Version':
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
#' @inheritParams xgb.DMatrix
#' @param data_iterator A data iterator structure as returned by [xgb.DataIter()],
#'   which includes an environment shared between function calls, and functions to access
#'   the data in batches on-demand.
#' @param cache_prefix The path of cache file, caller must initialize all the directories in this path.
#' @param missing A float value to represents missing values in data.
#'
#'   Note that, while functions like [xgb.DMatrix()] can take a generic `NA` and interpret it
#'   correctly for different types like `numeric` and `integer`, if an `NA` value is passed here,
#'   it will not be adapted for different input types.
#'
#'   For example, in R `integer` types, missing values are represented by integer number `-2147483648`
#'   (since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes `NA`,
#'   which is interpreted as a floating-point NaN by [xgb.ExtMemDMatrix()] and by
#'   [xgb.QuantileDMatrix.from_iterator()], these integer missing values will not be treated as missing.
#'   This should not pose any problem for `numeric` types, since they do have an inheret NaN value.
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExtMemDMatrix', in which the data is not
#'   held internally but accessed through the iterator when needed.
#' @seealso [xgb.DataIter()], [xgb.DataBatch()], [xgb.QuantileDMatrix.from_iterator()]
#' @examples
#' data(mtcars)
#'
#' # This custom environment will be passed to the iterator
#' # functions at each call. It is up to the user to keep
#' # track of the iteration number in this environment.
#' iterator_env <- as.environment(
#'   list(
#'     iter = 0,
#'     x = mtcars[, -1],
#'     y = mtcars[, 1]
#'   )
#' )
#'
#' # Data is passed in two batches.
#' # In this example, batches are obtained by subsetting the 'x' variable.
#' # This is not advantageous to do, since the data is already loaded in memory
#' # and can be passed in full in one go, but there can be situations in which
#' # only a subset of the data will fit in the computer's memory, and it can
#' # be loaded in batches that are accessed one-at-a-time only.
#' iterator_next <- function(iterator_env) {
#'   curr_iter <- iterator_env[["iter"]]
#'   if (curr_iter >= 2) {
#'     # there are only two batches, so this signals end of the stream
#'     return(NULL)
#'   }
#'
#'   if (curr_iter == 0) {
#'     x_batch <- iterator_env[["x"]][1:16, ]
#'     y_batch <- iterator_env[["y"]][1:16]
#'   } else {
#'     x_batch <- iterator_env[["x"]][17:32, ]
#'     y_batch <- iterator_env[["y"]][17:32]
#'   }
#'   on.exit({
#'     iterator_env[["iter"]] <- curr_iter + 1
#'   })
#'
#'   # Function 'xgb.DataBatch' must be called manually
#'   # at each batch with all the appropriate attributes,
#'   # such as feature names and feature types.
#'   return(xgb.DataBatch(data = x_batch, label = y_batch))
#' }
#'
#' # This moves the iterator back to its beginning
#' iterator_reset <- function(iterator_env) {
#'   iterator_env[["iter"]] <- 0
#' }
#'
#' data_iterator <- xgb.DataIter(
#'   env = iterator_env,
#'   f_next = iterator_next,
#'   f_reset = iterator_reset
#' )
#' cache_prefix <- tempdir()
#'
#' # DMatrix will be constructed from the iterator's batches
#' dm <- xgb.ExtMemDMatrix(data_iterator, cache_prefix, nthread = 1)
#'
#' # After construction, can be used as a regular DMatrix
#' params <- list(nthread = 1, objective = "reg:squarederror")
#' model <- xgb.train(data = dm, nrounds = 2, params = params)
#'
#' # Predictions can also be called on it, and should be the same
#' # as if the data were passed differently.
#' pred_dm <- predict(model, dm)
#' pred_mat <- predict(model, as.matrix(mtcars[, -1]))
#' @export
xgb.ExtMemDMatrix <- function(
  data_iterator,
  cache_prefix = tempdir(),
  missing = NA,
  nthread = NULL
) {
  stopifnot(inherits(data_iterator, "xgb.DataIter"))
  stopifnot(is.character(cache_prefix))

  cache_prefix <- path.expand(cache_prefix)
  nthread <- as.integer(NVL(nthread, -1L))

  proxy_handle <- .make.proxy.handle()
  on.exit({
    .Call(XGDMatrixFree_R, proxy_handle)
  })
  iterator_next <- function() {
    return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
  }
  iterator_reset <- function() {
    return(data_iterator$f_reset(data_iterator$env))
  }
  calling_env <- environment()

  dmat <- .Call(
    XGDMatrixCreateFromCallback_R,
    iterator_next,
    iterator_reset,
    calling_env,
    proxy_handle,
    nthread,
    missing,
    cache_prefix
  )

  attributes(dmat) <- list(
    class = c("xgb.DMatrix", "xgb.ExtMemDMatrix"),
    fields = attributes(proxy_handle)$fields
  )
  return(dmat)
}


#' QuantileDMatrix from External Data
#'
#' @description
#' Create an `xgb.QuantileDMatrix` object (exact same class as would be returned by
#' calling function [xgb.QuantileDMatrix()], with the same advantages and limitations) from
#' external data supplied by [xgb.DataIter()], potentially passed in batches from
#' a bigger set that might not fit entirely in memory, same way as [xgb.ExtMemDMatrix()].
#'
#' Note that, while external data will only be loaded through the iterator (thus the full data
#' might not be held entirely in-memory), the quantized representation of the data will get
#' created in-memory, being concatenated from multiple calls to the data iterator. The quantized
#' version is typically lighter than the original data, so there might be cases in which this
#' representation could potentially fit in memory even if the full data does not.
#'
#' For more information, see the guide 'Using XGBoost External Memory Version':
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
#' @inheritParams xgb.ExtMemDMatrix
#' @inheritParams xgb.QuantileDMatrix
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
#' @seealso [xgb.DataIter()], [xgb.DataBatch()], [xgb.ExtMemDMatrix()],
#' [xgb.QuantileDMatrix()]
#' @export
xgb.QuantileDMatrix.from_iterator <- function( # nolint
  data_iterator,
  missing = NA,
  nthread = NULL,
  ref = NULL,
  max_bin = NULL
) {
  stopifnot(inherits(data_iterator, "xgb.DataIter"))
  if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) {
    stop("'ref' must be an xgb.DMatrix object.")
  }

  nthread <- as.integer(NVL(nthread, -1L))

  proxy_handle <- .make.proxy.handle()
  on.exit({
    .Call(XGDMatrixFree_R, proxy_handle)
  })
  iterator_next <- function() {
    return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
  }
  iterator_reset <- function() {
    return(data_iterator$f_reset(data_iterator$env))
  }
  calling_env <- environment()

  dmat <- .Call(
    XGQuantileDMatrixCreateFromCallback_R,
    iterator_next,
    iterator_reset,
    calling_env,
    proxy_handle,
    nthread,
    missing,
    max_bin,
    ref
  )

  attributes(dmat) <- list(
    class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
    fields = attributes(proxy_handle)$fields
  )
  return(dmat)
}

#' Check whether DMatrix object has a field
#'
#' Checks whether an xgb.DMatrix object has a given field assigned to
#' it, such as weights, labels, etc.
#' @param object The DMatrix object to check for the given `info` field.
#' @param info The field to check for presence or absence in `object`.
#' @seealso [xgb.DMatrix()], [getinfo.xgb.DMatrix()], [setinfo.xgb.DMatrix()]
#' @examples
#' x <- matrix(1:10, nrow = 5)
#' dm <- xgb.DMatrix(x, nthread = 1)
#'
#' # 'dm' so far does not have any fields set
#' xgb.DMatrix.hasinfo(dm, "label")
#'
#' # Fields can be added after construction
#' setinfo(dm, "label", 1:5)
#' xgb.DMatrix.hasinfo(dm, "label")
#' @export
xgb.DMatrix.hasinfo <- function(object, info) {
  if (!inherits(object, "xgb.DMatrix")) {
    stop("Object is not an 'xgb.DMatrix'.")
  }
  if (.Call(XGCheckNullPtr_R, object)) {
    warning("xgb.DMatrix object is invalid. Must be constructed again.")
    return(FALSE)
  }
  return(NVL(attr(object, "fields")[[info]], FALSE))
}


#' Dimensions of xgb.DMatrix
#'
#' Returns a vector of numbers of rows and of columns in an `xgb.DMatrix`.
#'
#' @param x Object of class `xgb.DMatrix`
#'
#' @details
#' Note: since [nrow()] and [ncol()] internally use [dim()], they can also
#' be directly used with an `xgb.DMatrix` object.
#'
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
#'
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
#'
#' @export
dim.xgb.DMatrix <- function(x) {
  c(.Call(XGDMatrixNumRow_R, x), .Call(XGDMatrixNumCol_R, x))
}


#' Handling of column names of `xgb.DMatrix`
#'
#' Only column names are supported for `xgb.DMatrix`, thus setting of
#' row names would have no effect and returned row names would be `NULL`.
#'
#' @param x Object of class `xgb.DMatrix`.
#' @param value A list of two elements: the first one is ignored
#'   and the second one is column names
#'
#' @details
#' Generic [dimnames()] methods are used by [colnames()].
#' Since row names are irrelevant, it is recommended to use [colnames()] directly.
#'
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
#' dimnames(dtrain)
#' colnames(dtrain)
#' colnames(dtrain) <- make.names(1:ncol(train$data))
#' print(dtrain, verbose = TRUE)
#'
#' @rdname dimnames.xgb.DMatrix
#' @export
dimnames.xgb.DMatrix <- function(x) {
  fn <- getinfo(x, "feature_name")
  ## row names is null.
  list(NULL, fn)
}

#' @rdname dimnames.xgb.DMatrix
#' @export
`dimnames<-.xgb.DMatrix` <- function(x, value) {
  if (!is.list(value) || length(value) != 2L)
    stop("invalid 'dimnames' given: must be a list of two elements")
  if (!is.null(value[[1L]]))
    stop("xgb.DMatrix does not have rownames")
  if (is.null(value[[2]])) {
    setinfo(x, "feature_name", NULL)
    return(x)
  }
  if (ncol(x) != length(value[[2]])) {
    stop("can't assign ", length(value[[2]]), " colnames to a ", ncol(x), " column xgb.DMatrix")
  }
  setinfo(x, "feature_name", value[[2]])
  x
}


#' Get or set information of xgb.DMatrix and xgb.Booster objects
#'
#' @param object Object of class `xgb.DMatrix` or `xgb.Booster`.
#' @param name The name of the information field to get (see details).
#' @return For `getinfo()`, will return the requested field. For `setinfo()`,
#'   will always return value `TRUE` if it succeeds.
#' @details
#' The `name` field can be one of the following for `xgb.DMatrix`:
#' - label
#' - weight
#' - base_margin
#' - label_lower_bound
#' - label_upper_bound
#' - group
#' - feature_type
#' - feature_name
#' - nrow
#'
#' See the documentation for [xgb.DMatrix()] for more information about these fields.
#'
#' For `xgb.Booster`, can be one of the following:
#' - `feature_type`
#' - `feature_name`
#'
#' Note that, while 'qid' cannot be retrieved, it is possible to get the equivalent 'group'
#' for a DMatrix that had 'qid' assigned.
#'
#' **Important**: when calling [setinfo()], the objects are modified in-place. See
#' [xgb.copy.Booster()] for an idea of this in-place assignment works.
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' labels <- getinfo(dtrain, "label")
#' setinfo(dtrain, "label", 1 - labels)
#'
#' labels2 <- getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' @rdname getinfo
#' @export
getinfo <- function(object, name) UseMethod("getinfo")

#' @rdname getinfo
#' @export
getinfo.xgb.DMatrix <- function(object, name) {
  allowed_int_fields <- 'group'
  allowed_float_fields <- c(
    'label', 'weight', 'base_margin',
    'label_lower_bound', 'label_upper_bound'
  )
  allowed_str_fields <- c("feature_type", "feature_name")
  allowed_fields <- c(allowed_float_fields, allowed_int_fields, allowed_str_fields, 'nrow')

  if (typeof(name) != "character" ||
        length(name) != 1 ||
        !name %in% allowed_fields) {
    stop("getinfo: name must be one of the following\n",
         paste(paste0("'", allowed_fields, "'"), collapse = ", "))
  }
  if (name == "nrow") {
    ret <- nrow(object)
  } else if (name %in% allowed_str_fields) {
    ret <- .Call(XGDMatrixGetStrFeatureInfo_R, object, name)
  } else if (name %in% allowed_float_fields) {
    ret <- .Call(XGDMatrixGetFloatInfo_R, object, name)
    if (length(ret) > nrow(object)) {
      ret <- matrix(ret, nrow = nrow(object), byrow = TRUE)
    }
  } else if (name %in% allowed_int_fields) {
    if (name == "group") {
      name <- "group_ptr"
    }
    ret <- .Call(XGDMatrixGetUIntInfo_R, object, name)
    if (length(ret) > nrow(object)) {
      ret <- matrix(ret, nrow = nrow(object), byrow = TRUE)
    }
  }
  if (length(ret) == 0) return(NULL)
  return(ret)
}

#' @rdname getinfo
#' @param info The specific field of information to set.
#'
#' @details
#' See the documentation for [xgb.DMatrix()] for possible fields that can be set
#' (which correspond to arguments in that function).
#'
#' Note that the following fields are allowed in the construction of an `xgb.DMatrix`
#' but **are not** allowed here:
#' - data
#' - missing
#' - silent
#' - nthread
#'
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' labels <- getinfo(dtrain, "label")
#' setinfo(dtrain, "label", 1 - labels)
#'
#' labels2 <- getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' @export
setinfo <- function(object, name, info) UseMethod("setinfo")

#' @rdname getinfo
#' @export
setinfo.xgb.DMatrix <- function(object, name, info) {
  .internal.setinfo.xgb.DMatrix(object, name, info)
  attr(object, "fields")[[name]] <- TRUE
  return(TRUE)
}

.internal.setinfo.xgb.DMatrix <- function(object, name, info) {
  if (name == "label") {
    if (NROW(info) != nrow(object))
      stop("The length of labels must equal to the number of rows in the input data")
    .Call(XGDMatrixSetInfo_R, object, name, info)
    return(TRUE)
  }
  if (name == "label_lower_bound") {
    if (NROW(info) != nrow(object))
      stop("The length of lower-bound labels must equal to the number of rows in the input data")
    .Call(XGDMatrixSetInfo_R, object, name, info)
    return(TRUE)
  }
  if (name == "label_upper_bound") {
    if (NROW(info) != nrow(object))
      stop("The length of upper-bound labels must equal to the number of rows in the input data")
    .Call(XGDMatrixSetInfo_R, object, name, info)
    return(TRUE)
  }
  if (name == "weight") {
    .Call(XGDMatrixSetInfo_R, object, name, info)
    return(TRUE)
  }
  if (name == "base_margin") {
    .Call(XGDMatrixSetInfo_R, object, name, info)
    return(TRUE)
  }
  if (name == "group") {
    if (sum(info) != nrow(object))
      stop("The sum of groups must equal to the number of rows in the input data")
    .Call(XGDMatrixSetInfo_R, object, name, info)
    return(TRUE)
  }
  if (name == "qid") {
    if (NROW(info) != nrow(object))
      stop("The length of qid assignments must equal to the number of rows in the input data")
    .Call(XGDMatrixSetInfo_R, object, name, info)
    return(TRUE)
  }
  if (name == "feature_weights") {
    if (NROW(info) != ncol(object)) {
      stop("The number of feature weights must equal to the number of columns in the input data")
    }
    .Call(XGDMatrixSetInfo_R, object, name, info)
    return(TRUE)
  }

  set_feat_info <- function(name) {
    msg <- sprintf(
      "The number of %s must equal to the number of columns in the input data. %s vs. %s",
      name,
      length(info),
      ncol(object)
    )
    if (!is.null(info)) {
      info <- as.list(info)
      if (length(info) != ncol(object)) {
        stop(msg)
      }
    }
    .Call(XGDMatrixSetStrFeatureInfo_R, object, name, info)
  }
  if (name == "feature_name") {
    set_feat_info("feature_name")
    return(TRUE)
  }
  if (name == "feature_type") {
    set_feat_info("feature_type")
    return(TRUE)
  }
  stop("setinfo: unknown info name ", name)
}

#' Get Quantile Cuts from DMatrix
#'
#' @description
#' Get the quantile cuts (a.k.a. borders) from an `xgb.DMatrix`
#' that has been quantized for the histogram method (`tree_method = "hist"`).
#'
#' These cuts are used in order to assign observations to bins - i.e. these are ordered
#' boundaries which are used to determine assignment condition `border_low < x < border_high`.
#' As such, the first and last bin will be outside of the range of the data, so as to include
#' all of the observations there.
#'
#' If a given column has 'n' bins, then there will be 'n+1' cuts / borders for that column,
#' which will be output in sorted order from lowest to highest.
#'
#' Different columns can have different numbers of bins according to their range.
#' @param dmat An `xgb.DMatrix` object, as returned by [xgb.DMatrix()].
#' @param output Output format for the quantile cuts. Possible options are:
#'   - "list"` will return the output as a list with one entry per column, where
#'     each column will have a numeric vector with the cuts. The list will be named if
#'     `dmat` has column names assigned to it.
#'   - `"arrays"` will return a list with entries `indptr` (base-0 indexing) and
#'     `data`. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
#' `   indptr[i]+1` to `indptr[i+1]`.
#' @return The quantile cuts, in the format specified by parameter `output`.
#' @examples
#' data(mtcars)
#'
#' y <- mtcars$mpg
#' x <- as.matrix(mtcars[, -1])
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
#'
#' # DMatrix is not quantized right away, but will be once a hist model is generated
#' model <- xgb.train(
#'   data = dm,
#'   params = list(tree_method = "hist", max_bin = 8, nthread = 1),
#'   nrounds = 3
#' )
#'
#' # Now can get the quantile cuts
#' xgb.get.DMatrix.qcut(dm)
#' @export
xgb.get.DMatrix.qcut <- function(dmat, output = c("list", "arrays")) { # nolint
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  output <- head(output, 1L)
  stopifnot(output %in% c("list", "arrays"))
  res <- .Call(XGDMatrixGetQuantileCut_R, dmat)
  if (output == "arrays") {
    return(res)
  } else {
    feature_names <- getinfo(dmat, "feature_name")
    ncols <- length(res$indptr) - 1
    out <- lapply(
      seq(1, ncols),
      function(col) {
        st <- res$indptr[col]
        end <- res$indptr[col + 1]
        if (end <= st) {
          return(numeric())
        }
        return(res$data[seq(1 + st, end)])
      }
    )
    if (NROW(feature_names)) {
      names(out) <- feature_names
    }
    return(out)
  }
}

#' Get Number of Non-Missing Entries in DMatrix
#'
#' @param dmat An `xgb.DMatrix` object, as returned by [xgb.DMatrix()].
#' @return The number of non-missing entries in the DMatrix.
#' @export
xgb.get.DMatrix.num.non.missing <- function(dmat) { # nolint
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  return(.Call(XGDMatrixNumNonMissing_R, dmat))
}

#' Get DMatrix Data
#'
#' @param dmat An `xgb.DMatrix` object, as returned by [xgb.DMatrix()].
#' @return The data held in the DMatrix, as a sparse CSR matrix (class `dgRMatrix`
#' from package `Matrix`). If it had feature names, these will be added as column names
#' in the output.
#' @export
xgb.get.DMatrix.data <- function(dmat) {
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  res <- .Call(XGDMatrixGetDataAsCSR_R, dmat)
  out <- methods::new("dgRMatrix")
  nrows <- as.integer(length(res$indptr) - 1)
  out@p <- res$indptr
  out@j <- res$indices
  out@x <- res$data
  out@Dim <- as.integer(c(nrows, res$ncols))

  feature_names <- getinfo(dmat, "feature_name")
  dim_names <- list(NULL, NULL)
  if (NROW(feature_names)) {
    dim_names[[2L]] <- feature_names
  }
  out@Dimnames <- dim_names
  return(out)
}

#' Slice DMatrix
#'
#' Get a new DMatrix containing the specified rows of original xgb.DMatrix object.
#'
#' @param object Object of class `xgb.DMatrix`.
#' @param idxset An integer vector of indices of rows needed (base-1 indexing).
#' @param allow_groups Whether to allow slicing an `xgb.DMatrix` with `group` (or
#'   equivalently `qid`) field. Note that in such case, the result will not have
#'   the groups anymore - they need to be set manually through [setinfo()].
#' @param colset Currently not used (columns subsetting is not available).
#'
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' dsub <- xgb.slice.DMatrix(dtrain, 1:42)
#' labels1 <- getinfo(dsub, "label")
#'
#' dsub <- dtrain[1:42, ]
#' labels2 <- getinfo(dsub, "label")
#' all.equal(labels1, labels2)
#'
#' @rdname xgb.slice.DMatrix
#' @export
xgb.slice.DMatrix <- function(object, idxset, allow_groups = FALSE) {
  if (!inherits(object, "xgb.DMatrix")) {
    stop("object must be xgb.DMatrix")
  }
  ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset, allow_groups)

  attr_list <- attributes(object)
  nr <- nrow(object)
  len <- sapply(attr_list, NROW)
  ind <- which(len == nr)
  if (length(ind) > 0) {
    nms <- names(attr_list)[ind]
    for (i in seq_along(ind)) {
      obj_attr <- attr(object, nms[i])
      if (NCOL(obj_attr) > 1) {
        attr(ret, nms[i]) <- obj_attr[idxset, ]
      } else {
        attr(ret, nms[i]) <- obj_attr[idxset]
      }
    }
  }

  out <- structure(ret, class = "xgb.DMatrix")
  parent_fields <- as.list(attributes(object)$fields)
  if (NROW(parent_fields)) {
    child_fields <- parent_fields[!(names(parent_fields) %in% c("group", "qid"))]
    child_fields <- as.environment(child_fields)
    attributes(out)$fields <- child_fields
  }
  return(out)
}

#' @rdname xgb.slice.DMatrix
#' @export
`[.xgb.DMatrix` <- function(object, idxset, colset = NULL) {
  xgb.slice.DMatrix(object, idxset)
}


#' Print xgb.DMatrix
#'
#' Print information about xgb.DMatrix.
#' Currently it displays dimensions and presence of info-fields and colnames.
#'
#' @param x An xgb.DMatrix object.
#' @param verbose Whether to print colnames (when present).
#' @param ... Not currently used.
#'
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#' dtrain
#'
#' print(dtrain, verbose = TRUE)
#'
#' @method print xgb.DMatrix
#' @export
print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
  if (.Call(XGCheckNullPtr_R, x)) {
    cat("INVALID xgb.DMatrix object. Must be constructed anew.\n")
    return(invisible(x))
  }
  class_print <- if (inherits(x, "xgb.QuantileDMatrix")) {
    "xgb.QuantileDMatrix"
  } else if (inherits(x, "xgb.ExtMemDMatrix")) {
    "xgb.ExtMemDMatrix"
  } else if (inherits(x, "xgb.ProxyDMatrix")) {
    "xgb.ProxyDMatrix"
  } else {
    "xgb.DMatrix"
  }

  cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ')
  infos <- names(attributes(x)$fields)
  infos <- infos[infos != "feature_name"]
  if (!NROW(infos)) infos <- "NA"
  infos <- infos[order(infos)]
  infos <- paste(infos, collapse = ", ")
  cat(infos)
  cnames <- colnames(x)
  cat('  colnames:')
  if (verbose && !is.null(cnames)) {
    cat("\n'")
    cat(cnames, sep = "','")
    cat("'")
  } else {
    if (is.null(cnames)) cat(' no')
    else cat(' yes')
  }
  cat("\n")
  invisible(x)
}