Add data split mode to DMatrix MetaInfo (#8568)

This commit is contained in:
Rong Ou
2022-12-25 04:37:37 -08:00
committed by GitHub
parent 77b069c25d
commit 3ceeb8c61c
20 changed files with 113 additions and 103 deletions

View File

@@ -126,12 +126,28 @@ XGB_DLL int XGBGetGlobalConfig(char const **out_config);
/*!
* \brief load a data matrix
* \deprecated since 2.0.0
* \see XGDMatrixCreateFromURI()
* \param fname the name of the file
* \param silent whether to print messages during loading
* \param out a loaded data matrix
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle *out);
/*!
* \brief load a data matrix
* \param config JSON encoded parameters for DMatrix construction. Accepted fields are:
* - uri: The URI of the input file.
* - silent (optional): Whether to print messages during loading. Defaults to true.
* - data_split_mode (optional): Whether to split by row or column. In distributed mode, the
* file is split accordingly; otherwise this is only an indicator on how the file was split
* beforehand. Defaults to row.
* \param out a loaded data matrix
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixCreateFromURI(char const *config, DMatrixHandle *out);
/**
* @example c-api-demo.c
*/

View File

@@ -40,9 +40,7 @@ enum class DataType : uint8_t {
enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };
enum class DataSplitMode : int {
kAuto = 0, kCol = 1, kRow = 2, kNone = 3
};
enum class DataSplitMode : int { kRow = 0, kCol = 1 };
/*!
* \brief Meta information about dataset, always sit in memory.
@@ -60,6 +58,8 @@ class MetaInfo {
uint64_t num_nonzero_{0}; // NOLINT
/*! \brief label of each instance */
linalg::Tensor<float, 2> labels;
/*! \brief data split mode */
DataSplitMode data_split_mode{DataSplitMode::kRow};
/*!
* \brief the index of begin and end of a group
* needed when the learning task is ranking.
@@ -544,15 +544,16 @@ class DMatrix {
* \brief Load DMatrix from URI.
* \param uri The URI of input.
* \param silent Whether to print information during loading.
* \param data_split_mode Mode to read in part of the data, divided among the workers in distributed mode.
* \param data_split_mode In distributed mode, split the input according to this mode; otherwise,
* it's just an indicator on how the input was split beforehand.
* \param file_format The format type of the file, used for dmlc::Parser::Create.
* By default "auto" will be able to load in both local binary file.
* \param page_size Page size for external memory.
* \return The created DMatrix.
*/
static DMatrix* Load(const std::string& uri,
bool silent,
DataSplitMode data_split_mode,
bool silent = true,
DataSplitMode data_split_mode = DataSplitMode::kRow,
const std::string& file_format = "auto");
/**