From 1f9a57d17b4d351de7ae14aa23b13e4d490ed7d0 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 28 Apr 2023 19:45:15 +0800
Subject: [PATCH] [Breaking] Require format to be specified in input URI. (#9077)

Previously, we used `libsvm` as the default when the format was not specified
in the input URI. However, the dmlc data parser is not particularly robust
against errors, and the most common type of error is an undefined format.
Along with this change, we will recommend that users use other data loaders
instead. We will continue to maintain the parsers, as they are currently used
for many internal tests, including federated learning.

---
 R-package/tests/testthat/test_dmatrix.R | 2 +-
 demo/CLI/binary_classification/mushroom.conf | 6 +-
 demo/CLI/regression/machine.conf | 6 +-
 demo/c-api/basic/c-api-demo.c | 4 +-
 demo/guide-python/boost_from_prediction.py | 16 +++--
 demo/guide-python/cross_validation.py | 62 +++++++++++------
 demo/guide-python/evals_result.py | 35 ++++++----
 demo/guide-python/generalized_linear_model.py | 26 +++++--
 demo/guide-python/predict_first_ntree.py | 4 +-
 demo/guide-python/predict_leaf_indices.py | 14 ++--
 doc/tutorials/external_memory.rst | 6 +-
 doc/tutorials/input_format.rst | 7 +-
 include/xgboost/c_api.h | 6 +-
 include/xgboost/data.h | 12 ++--
 .../java/example/BasicWalkThrough.java | 9 +--
 .../java/example/BoostFromPrediction.java | 4 +-
 .../java/example/CrossValidation.java | 2 +-
 .../java/example/CustomObjective.java | 4 +-
 .../xgboost4j/java/example/EarlyStopping.java | 4 +-
 .../java/example/ExternalMemory.java | 4 +-
 .../java/example/GeneralizedLinearModel.java | 4 +-
 .../java/example/PredictFirstNtree.java | 4 +-
 .../java/example/PredictLeafIndices.java | 4 +-
 .../scala/example/BasicWalkThrough.scala | 8 +--
 .../scala/example/BoostFromPrediction.scala | 4 +-
 .../scala/example/CrossValidation.scala | 2 +-
 .../scala/example/CustomObjective.scala | 4 +-
 .../scala/example/ExternalMemory.scala | 4 +-
 .../example/GeneralizedLinearModel.scala | 4 +-
 .../scala/example/PredictFirstNTree.scala | 4 +-
 .../scala/example/PredictLeafIndices.scala | 4 +-
 .../dmlc/xgboost4j/java/BoosterImplTest.java | 4 +-
 .../ml/dmlc/xgboost4j/java/DMatrixTest.java | 4 +-
 .../dmlc/xgboost4j/scala/DMatrixSuite.scala | 2 +-
 .../scala/ScalaBoosterImplSuite.scala | 40 +++++------
 python-package/xgboost/testing/__init__.py | 7 ++
 src/data/data.cc | 68 ++++++-------------
 src/data/file_iterator.h | 51 ++++++++++----
 tests/cpp/common/test_hist_util.h | 3 +-
 tests/cpp/data/test_file_iterator.cc | 8 +--
 tests/cpp/data/test_metainfo.cc | 5 +-
 tests/cpp/data/test_simple_dmatrix.cc | 12 ++--
 tests/cpp/data/test_sparse_page_dmatrix.cc | 17 +++--
 tests/cpp/data/test_sparse_page_dmatrix.cu | 2 +-
 tests/cpp/helpers.cc | 2 +-
 tests/cpp/test_learner.cc | 3 +-
 tests/python/test_basic.py | 18 ++---
 tests/python/test_basic_models.py | 24 +++----
 tests/python/test_callback.py | 8 +--
 tests/python/test_dmatrix.py | 6 +-
 tests/python/test_interaction_constraints.py | 8 ++-
 tests/python/test_monotone_constraints.py | 4 +-
 tests/python/test_openmp.py | 4 +-
 tests/python/test_parse_tree.py | 2 +-
 tests/python/test_plotting.py | 4 +-
 tests/python/test_shap.py | 4 +-
 tests/python/test_updaters.py | 4 +-
 tests/python/test_with_pandas.py | 2 +-
 58 files changed, 327 insertions(+), 268 deletions(-)

diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 1d8cb0f23..21d39f255 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -72,7 +72,7 @@ test_that("xgb.DMatrix: 
saving, loading", { tmp <- c("0 1:1 2:1", "1 3:1", "0 1:1") tmp_file <- tempfile(fileext = ".libsvm") writeLines(tmp, tmp_file) - dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE) + dtest4 <- xgb.DMatrix(paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE) expect_equal(dim(dtest4), c(3, 4)) expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0)) diff --git a/demo/CLI/binary_classification/mushroom.conf b/demo/CLI/binary_classification/mushroom.conf index 3cf865465..d78199cd7 100644 --- a/demo/CLI/binary_classification/mushroom.conf +++ b/demo/CLI/binary_classification/mushroom.conf @@ -20,10 +20,10 @@ num_round = 2 # 0 means do not save any model except the final round model save_period = 2 # The path of training data -data = "agaricus.txt.train" +data = "agaricus.txt.train?format=libsvm" # The path of validation data, used to monitor training process, here [test] sets name of the validation set -eval[test] = "agaricus.txt.test" +eval[test] = "agaricus.txt.test?format=libsvm" # evaluate on training data as well each round eval_train = 1 # The path of test data -test:data = "agaricus.txt.test" +test:data = "agaricus.txt.test?format=libsvm" diff --git a/demo/CLI/regression/machine.conf b/demo/CLI/regression/machine.conf index 4ba8437d5..42e2b1227 100644 --- a/demo/CLI/regression/machine.conf +++ b/demo/CLI/regression/machine.conf @@ -21,8 +21,8 @@ num_round = 2 # 0 means do not save any model except the final round model save_period = 0 # The path of training data -data = "machine.txt.train" +data = "machine.txt.train?format=libsvm" # The path of validation data, used to monitor training process, here [test] sets name of the validation set -eval[test] = "machine.txt.test" +eval[test] = "machine.txt.test?format=libsvm" # The path of test data -test:data = "machine.txt.test" +test:data = "machine.txt.test?format=libsvm" diff --git a/demo/c-api/basic/c-api-demo.c b/demo/c-api/basic/c-api-demo.c index ca6e689aa..15a224e9e 100644 --- a/demo/c-api/basic/c-api-demo.c +++ b/demo/c-api/basic/c-api-demo.c @@ -42,8 +42,8 @@ int main() { // load the data DMatrixHandle dtrain, dtest; - safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train", silent, &dtrain)); - safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test", silent, &dtest)); + safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train?format=libsvm", silent, &dtrain)); + safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test?format=libsvm", silent, &dtest)); // create the booster BoosterHandle booster; diff --git a/demo/guide-python/boost_from_prediction.py b/demo/guide-python/boost_from_prediction.py index 53a45549a..13f91d7c8 100644 --- a/demo/guide-python/boost_from_prediction.py +++ b/demo/guide-python/boost_from_prediction.py @@ -7,15 +7,19 @@ import os import xgboost as xgb CURRENT_DIR = os.path.dirname(__file__) -dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train')) -dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test')) -watchlist = [(dtest, 'eval'), (dtrain, 'train')] +dtrain = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm") +) +dtest = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm") +) +watchlist = [(dtest, "eval"), (dtrain, "train")] ### # advanced: start from a initial base prediction # -print('start running example to start from a initial prediction') +print("start running example to start from a initial prediction") # specify parameters via map, definition are same as 
c++ version -param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} +param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} # train xgboost for 1 round bst = xgb.train(param, dtrain, 1, watchlist) # Note: we need the margin value instead of transformed prediction in @@ -27,5 +31,5 @@ ptest = bst.predict(dtest, output_margin=True) dtrain.set_base_margin(ptrain) dtest.set_base_margin(ptest) -print('this is result of running from initial prediction') +print("this is result of running from initial prediction") bst = xgb.train(param, dtrain, 1, watchlist) diff --git a/demo/guide-python/cross_validation.py b/demo/guide-python/cross_validation.py index 2565b02c9..4e537108a 100644 --- a/demo/guide-python/cross_validation.py +++ b/demo/guide-python/cross_validation.py @@ -10,27 +10,45 @@ import xgboost as xgb # load data in do training CURRENT_DIR = os.path.dirname(__file__) -dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train')) -param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'} +dtrain = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm") +) +param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} num_round = 2 -print('running cross validation') +print("running cross validation") # do cross validation, this will print result out as # [iteration] metric_name:mean_value+std_value # std_value is standard deviation of the metric -xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'error'}, seed=0, - callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)]) +xgb.cv( + param, + dtrain, + num_round, + nfold=5, + metrics={"error"}, + seed=0, + callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)], +) -print('running cross validation, disable standard deviation display') +print("running cross validation, disable standard deviation display") # do cross validation, this will print result out as # [iteration] metric_name:mean_value -res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5, - metrics={'error'}, seed=0, - callbacks=[xgb.callback.EvaluationMonitor(show_stdv=False), - xgb.callback.EarlyStopping(3)]) +res = xgb.cv( + param, + dtrain, + num_boost_round=10, + nfold=5, + metrics={"error"}, + seed=0, + callbacks=[ + xgb.callback.EvaluationMonitor(show_stdv=False), + xgb.callback.EarlyStopping(3), + ], +) print(res) -print('running cross validation, with preprocessing function') +print("running cross validation, with preprocessing function") + + # define the preprocessing function # used to return the preprocessed training, test data, and parameter # we can use this to do weight rescale, etc. 
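(Aside, not part of the diff: the demos above all append a query string to an existing file path. Below is a minimal sketch of the URI forms introduced by this change, assuming the bundled agaricus data files; the cache prefix name `dtrain.cache` is only illustrative.)

    import os

    import xgboost as xgb

    path = os.path.join("../data", "agaricus.txt.train")

    # In-memory load: the `format` parameter is now required for text input.
    dtrain = xgb.DMatrix(path + "?format=libsvm")

    # External-memory load: the cache prefix still follows `#`, placed after the query.
    dtrain_ext = xgb.DMatrix(path + "?format=libsvm#dtrain.cache")
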
@@ -38,32 +56,36 @@ print('running cross validation, with preprocessing function') def fpreproc(dtrain, dtest, param): label = dtrain.get_label() ratio = float(np.sum(label == 0)) / np.sum(label == 1) - param['scale_pos_weight'] = ratio + param["scale_pos_weight"] = ratio return (dtrain, dtest, param) + # do cross validation, for each fold # the dtrain, dtest, param will be passed into fpreproc # then the return value of fpreproc will be used to generate # results of that fold -xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'auc'}, seed=0, fpreproc=fpreproc) +xgb.cv(param, dtrain, num_round, nfold=5, metrics={"auc"}, seed=0, fpreproc=fpreproc) ### # you can also do cross validation with customized loss function # See custom_objective.py ## -print('running cross validation, with customized loss function') +print("running cross validation, with customized loss function") + + def logregobj(preds, dtrain): labels = dtrain.get_label() preds = 1.0 / (1.0 + np.exp(-preds)) grad = preds - labels hess = preds * (1.0 - preds) return grad, hess + + def evalerror(preds, dtrain): labels = dtrain.get_label() - return 'error', float(sum(labels != (preds > 0.0))) / len(labels) + return "error", float(sum(labels != (preds > 0.0))) / len(labels) -param = {'max_depth':2, 'eta':1} + +param = {"max_depth": 2, "eta": 1} # train with customized objective -xgb.cv(param, dtrain, num_round, nfold=5, seed=0, - obj=logregobj, feval=evalerror) +xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror) diff --git a/demo/guide-python/evals_result.py b/demo/guide-python/evals_result.py index bba8862f5..7b9da96da 100644 --- a/demo/guide-python/evals_result.py +++ b/demo/guide-python/evals_result.py @@ -7,28 +7,37 @@ import os import xgboost as xgb CURRENT_DIR = os.path.dirname(__file__) -dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train')) -dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test')) +dtrain = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm") +) +dtest = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm") +) -param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')] +param = [ + ("max_depth", 2), + ("objective", "binary:logistic"), + ("eval_metric", "logloss"), + ("eval_metric", "error"), +] num_round = 2 -watchlist = [(dtest,'eval'), (dtrain,'train')] +watchlist = [(dtest, "eval"), (dtrain, "train")] evals_result = {} bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result) -print('Access logloss metric directly from evals_result:') -print(evals_result['eval']['logloss']) +print("Access logloss metric directly from evals_result:") +print(evals_result["eval"]["logloss"]) -print('') -print('Access metrics through a loop:') +print("") +print("Access metrics through a loop:") for e_name, e_mtrs in evals_result.items(): - print('- {}'.format(e_name)) + print("- {}".format(e_name)) for e_mtr_name, e_mtr_vals in e_mtrs.items(): - print(' - {}'.format(e_mtr_name)) - print(' - {}'.format(e_mtr_vals)) + print(" - {}".format(e_mtr_name)) + print(" - {}".format(e_mtr_vals)) -print('') -print('Access complete dictionary:') +print("") +print("Access complete dictionary:") print(evals_result) diff --git a/demo/guide-python/generalized_linear_model.py b/demo/guide-python/generalized_linear_model.py index 976428f13..3387b1982 100644 --- a/demo/guide-python/generalized_linear_model.py +++ 
b/demo/guide-python/generalized_linear_model.py @@ -11,14 +11,22 @@ import xgboost as xgb # basically, we are using linear model, instead of tree for our boosters ## CURRENT_DIR = os.path.dirname(__file__) -dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train')) -dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test')) +dtrain = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm") +) +dtest = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm") +) # change booster to gblinear, so that we are fitting a linear model # alpha is the L1 regularizer # lambda is the L2 regularizer # you can also set lambda_bias which is L2 regularizer on the bias term -param = {'objective':'binary:logistic', 'booster':'gblinear', - 'alpha': 0.0001, 'lambda': 1} +param = { + "objective": "binary:logistic", + "booster": "gblinear", + "alpha": 0.0001, + "lambda": 1, +} # normally, you do not need to set eta (step_size) # XGBoost uses a parallel coordinate descent algorithm (shotgun), @@ -29,9 +37,15 @@ param = {'objective':'binary:logistic', 'booster':'gblinear', ## # the rest of settings are the same ## -watchlist = [(dtest, 'eval'), (dtrain, 'train')] +watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 4 bst = xgb.train(param, dtrain, num_round, watchlist) preds = bst.predict(dtest) labels = dtest.get_label() -print('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds)))) +print( + "error=%f" + % ( + sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) + / float(len(preds)) + ) +) diff --git a/demo/guide-python/predict_first_ntree.py b/demo/guide-python/predict_first_ntree.py index 55f7c61af..78137b4e1 100644 --- a/demo/guide-python/predict_first_ntree.py +++ b/demo/guide-python/predict_first_ntree.py @@ -16,8 +16,8 @@ test = os.path.join(CURRENT_DIR, "../data/agaricus.txt.test") def native_interface(): # load data in do training - dtrain = xgb.DMatrix(train) - dtest = xgb.DMatrix(test) + dtrain = xgb.DMatrix(train + "?format=libsvm") + dtest = xgb.DMatrix(test + "?format=libsvm") param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 3 diff --git a/demo/guide-python/predict_leaf_indices.py b/demo/guide-python/predict_leaf_indices.py index 45cc8fa7f..627619724 100644 --- a/demo/guide-python/predict_leaf_indices.py +++ b/demo/guide-python/predict_leaf_indices.py @@ -8,14 +8,18 @@ import xgboost as xgb # load data in do training CURRENT_DIR = os.path.dirname(__file__) -dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train')) -dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test')) -param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} -watchlist = [(dtest, 'eval'), (dtrain, 'train')] +dtrain = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm") +) +dtest = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm") +) +param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} +watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 3 bst = xgb.train(param, dtrain, num_round, watchlist) -print('start testing predict the leaf indices') +print("start testing predict the leaf indices") # predict using first 2 tree leafindex = bst.predict( dtest, iteration_range=(0, 2), pred_leaf=True, strict_shape=True diff --git a/doc/tutorials/external_memory.rst 
b/doc/tutorials/external_memory.rst index 3b96cfe92..006d63b43 100644 --- a/doc/tutorials/external_memory.rst +++ b/doc/tutorials/external_memory.rst @@ -77,7 +77,7 @@ The external memory version takes in the following `URI `_ for a description of the CSV format.). Please be careful that, XGBoost does **not** understand file extensions, nor try to guess the file format, as there is no universal agreement upon file extension of LIBSVM or CSV. Instead it employs `URI `_ format for specifying the precise input file type. For example if you provide a `csv` file ``./data.train.csv`` as input, XGBoost will blindly use the default LIBSVM parser to digest it and generate a parser error. Instead, users need to provide an URI in the form of ``train.csv?format=csv``. For external memory input, the URI should of a form similar to ``train.csv?format=csv#dtrain.cache``. See :ref:`python_data_interface` and :doc:`/tutorials/external_memory` also. + +XGBoost currently supports two text formats for ingesting data: LIBSVM and CSV. The rest of this document will describe the LIBSVM format. (See `this Wikipedia article `_ for a description of the CSV format.). Please be careful that, XGBoost does **not** understand file extensions, nor try to guess the file format, as there is no universal agreement upon file extension of LIBSVM or CSV. Instead it employs `URI `_ format for specifying the precise input file type. For example if you provide a `csv` file ``./data.train.csv`` as input, XGBoost will blindly use the default LIBSVM parser to digest it and generate a parser error. Instead, users need to provide an URI in the form of ``train.csv?format=csv`` or ``train.csv?format=libsvm``. For external memory input, the URI should of a form similar to ``train.csv?format=csv#dtrain.cache``. See :ref:`python_data_interface` and :doc:`/tutorials/external_memory` also. For training or predicting, XGBoost takes an instance file with the format as below: diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index e56680780..4b9d37335 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -138,7 +138,11 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle /*! * \brief load a data matrix * \param config JSON encoded parameters for DMatrix construction. Accepted fields are: - * - uri: The URI of the input file. + + * - uri: The URI of the input file. The URI parameter `format` is required when loading text data. + * \verbatim embed:rst:leading-asterisk + * See :doc:`/tutorials/input_format` for more info. + * \endverbatim * - silent (optional): Whether to print message during loading. Default to true. * - data_split_mode (optional): Whether to split by row or column. In distributed mode, the * file is split accordingly; otherwise this is only an indicator on how the file was split diff --git a/include/xgboost/data.h b/include/xgboost/data.h index fe22fb2b5..3f7b6ad85 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -566,21 +566,17 @@ class DMatrix { return Info().num_nonzero_ == Info().num_row_ * Info().num_col_; } - /*! + /** * \brief Load DMatrix from URI. + * * \param uri The URI of input. * \param silent Whether print information during loading. * \param data_split_mode In distributed mode, split the input according this mode; otherwise, * it's just an indicator on how the input was split beforehand. - * \param file_format The format type of the file, used for dmlc::Parser::Create. 
- * By default "auto" will be able to load in both local binary file. - * \param page_size Page size for external memory. * \return The created DMatrix. */ - static DMatrix* Load(const std::string& uri, - bool silent = true, - DataSplitMode data_split_mode = DataSplitMode::kRow, - const std::string& file_format = "auto"); + static DMatrix* Load(const std::string& uri, bool silent = true, + DataSplitMode data_split_mode = DataSplitMode::kRow); /** * \brief Creates a new DMatrix from an external data adapter. diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java index 7e4fe6806..8a74b74da 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2021 by Contributors + Copyright (c) 2014-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -62,8 +62,8 @@ public class BasicWalkThrough { public static void main(String[] args) throws IOException, XGBoostError { // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); HashMap params = new HashMap(); params.put("eta", 1.0); @@ -112,7 +112,8 @@ public class BasicWalkThrough { System.out.println("start build dmatrix from csr sparse data ..."); //build dmatrix from CSR Sparse Matrix - DataLoader.CSRSparseData spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train"); + DataLoader.CSRSparseData spData = + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm"); DMatrix trainMat2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data, DMatrix.SparseType.CSR, 127); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java index 7eb9e99f0..fe5db0465 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java @@ -32,8 +32,8 @@ public class BoostFromPrediction { System.out.println("start running example to start from a initial prediction"); // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); //specify parameters HashMap params = new HashMap(); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java index dbe5f368c..3577be226 100644 --- 
a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java @@ -30,7 +30,7 @@ import ml.dmlc.xgboost4j.java.XGBoostError; public class CrossValidation { public static void main(String[] args) throws IOException, XGBoostError { //load train mat - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); //set params HashMap params = new HashMap(); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java index 6d529974c..c631dc01a 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java @@ -139,9 +139,9 @@ public class CustomObjective { public static void main(String[] args) throws XGBoostError { //load train mat (svmlight format) - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); //load valid mat (svmlight format) - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); HashMap params = new HashMap(); params.put("eta", 1.0); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java index 61e752f85..9e52c12fd 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java @@ -29,9 +29,9 @@ import ml.dmlc.xgboost4j.java.example.util.DataLoader; public class EarlyStopping { public static void main(String[] args) throws IOException, XGBoostError { DataLoader.CSRSparseData trainCSR = - DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train"); + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm"); DataLoader.CSRSparseData testCSR = - DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test"); + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test?format=libsvm"); Map paramMap = new HashMap() { { diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java index 349098ae1..70b2b85b5 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java @@ -32,8 +32,8 @@ public class ExternalMemory { //this is the only difference, add a # followed by a cache prefix name //several cache file with the prefix will be generated //currently only support convert from libsvm file - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm#dtrain.cache"); + DMatrix testMat = new 
DMatrix("../../demo/data/agaricus.txt.test?format=libsvm#dtest.cache"); //specify parameters HashMap params = new HashMap(); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java index 422cdea6a..09cc91c7f 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java @@ -32,8 +32,8 @@ import ml.dmlc.xgboost4j.java.example.util.CustomEval; public class GeneralizedLinearModel { public static void main(String[] args) throws XGBoostError { // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); //specify parameters //change booster to gblinear, so that we are fitting a linear model diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java index c98534a93..9038502bd 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java @@ -31,8 +31,8 @@ import ml.dmlc.xgboost4j.java.example.util.CustomEval; public class PredictFirstNtree { public static void main(String[] args) throws XGBoostError { // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); //specify parameters HashMap params = new HashMap(); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java index 0fcfb39de..7b1dfcb28 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java @@ -31,8 +31,8 @@ import ml.dmlc.xgboost4j.java.XGBoostError; public class PredictLeafIndices { public static void main(String[] args) throws XGBoostError { // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); //specify parameters HashMap params = new HashMap(); diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala 
b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala index e8481b047..1893288b4 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,8 +36,8 @@ object BasicWalkThrough { } def main(args: Array[String]): Unit = { - val trainMax = new DMatrix("../../demo/data/agaricus.txt.train") - val testMax = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMax = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMax = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 @@ -76,7 +76,7 @@ object BasicWalkThrough { // build dmatrix from CSR Sparse Matrix println("start build dmatrix from csr sparse data ...") - val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train") + val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm") val trainMax2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data, JDMatrix.SparseType.CSR) trainMax2.setLabel(spData.labels) diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala index b894532fa..09b72fc50 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala @@ -24,8 +24,8 @@ object BoostFromPrediction { def main(args: Array[String]): Unit = { println("start running example to start from a initial prediction") - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala index 62f8b461a..6083209ec 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala @@ -21,7 +21,7 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix} object CrossValidation { def main(args: Array[String]): Unit = { - val trainMat: DMatrix = new DMatrix("../../demo/data/agaricus.txt.train") + val trainMat: DMatrix = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") // set params val params = new mutable.HashMap[String, Any] diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala index fe88423e7..8cc49c90d 100644 --- 
a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala @@ -138,8 +138,8 @@ object CustomObjective { } def main(args: Array[String]): Unit = { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 params += "max_depth" -> 2 diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala index 447c98295..c7f3d8bbb 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala @@ -25,8 +25,8 @@ object ExternalMemory { // this is the only difference, add a # followed by a cache prefix name // several cache file with the prefix will be generated // currently only support convert from libsvm file - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm#dtrain.cache") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm#dtest.cache") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala index 27ed98eca..e370010b6 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala @@ -27,8 +27,8 @@ import ml.dmlc.xgboost4j.scala.example.util.CustomEval */ object GeneralizedLinearModel { def main(args: Array[String]): Unit = { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") // specify parameters // change booster to gblinear, so that we are fitting a linear model diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala index 5395e3638..40a5ffc44 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala @@ -23,8 +23,8 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix} object PredictFirstNTree { def main(args: Array[String]): Unit = { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new 
DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala index f40a8aac6..7ae2e6520 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala @@ -25,8 +25,8 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix} object PredictLeafIndices { def main(args: Array[String]): Unit = { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java index cce1254d0..20a243f5b 100644 --- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java +++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java @@ -30,8 +30,8 @@ import org.junit.Test; * @author hzx */ public class BoosterImplTest { - private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1"; - private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1"; + private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1&format=libsvm"; + private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1&format=libsvm"; public static class EvalError implements IEvaluation { @Override diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java index cf174c6dd..d658c5529 100644 --- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java +++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software @@ -88,7 +88,7 @@ public class DMatrixTest { public void testCreateFromFile() throws XGBoostError { //create DMatrix from file String filePath = writeResourceIntoTempFile("/agaricus.txt.test"); - DMatrix dmat = new DMatrix(filePath); + DMatrix dmat = new DMatrix(filePath + "?format=libsvm"); //get label float[] labels = dmat.getLabel(); //check length diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala index 05c6856f7..53325effa 100644 --- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala +++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala @@ -25,7 +25,7 @@ import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix} class DMatrixSuite extends AnyFunSuite { test("create DMatrix from File") { - val dmat = new DMatrix("../../demo/data/agaricus.txt.test") + val dmat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") // get label val labels: Array[Float] = dmat.getLabel // check length diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala index 8cac9fe4f..2eda1fa2d 100644 --- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala +++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala @@ -95,8 +95,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("basic operation of booster") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val booster = trainBooster(trainMat, testMat) val predicts = booster.predict(testMat, true) @@ -106,8 +106,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { test("save/load model with path") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val eval = new EvalError val booster = trainBooster(trainMat, testMat) // save and load @@ -123,8 +123,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("save/load model with stream") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val eval = new EvalError val booster = trainBooster(trainMat, testMat) // save and load @@ -139,7 +139,7 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("cross validation") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") val params = List("eta" -> "1.0", "max_depth" -> "3", "silent" -> "1", "nthread" -> "6", "objective" -> "binary:logistic", "gamma" -> "1.0", "eval_metric" -> 
"error").toMap val round = 2 @@ -148,8 +148,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test with quantile histo depthwise") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val paramMap = List("max_depth" -> "3", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "eval_metric" -> "auc").toMap @@ -158,8 +158,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test with quantile histo lossguide") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val paramMap = List("max_depth" -> "3", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide", "max_leaves" -> "8", "eval_metric" -> "auc").toMap @@ -168,8 +168,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test with quantile histo lossguide with max bin") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val paramMap = List("max_depth" -> "3", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16", @@ -179,8 +179,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test with quantile histo depthwidth with max depth") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val paramMap = List("max_depth" -> "0", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "max_leaves" -> "8", "max_depth" -> "2", @@ -190,8 +190,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test with quantile histo depthwidth with max depth and max bin") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val paramMap = List("max_depth" -> "0", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2", @@ -201,7 +201,7 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test training from existing model in scala") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") val paramMap = List("max_depth" -> "0", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2", @@ -213,8 +213,8 @@ class 
ScalaBoosterImplSuite extends AnyFunSuite { } test("test getting number of features from a booster") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val booster = trainBooster(trainMat, testMat) TestCase.assertEquals(booster.getNumFeature, 127) diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 5566e0b2d..026381fe1 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -882,5 +882,12 @@ def data_dir(path: str) -> str: return os.path.join(demo_dir(path), "data") +def load_agaricus(path: str) -> Tuple[xgb.DMatrix, xgb.DMatrix]: + dpath = data_dir(path) + dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train?format=libsvm")) + dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test?format=libsvm")) + return dtrain, dtest + + def project_root(path: str) -> str: return normpath(os.path.join(demo_dir(path), os.path.pardir)) diff --git a/src/data/data.cc b/src/data/data.cc index 236bd9131..1aedd6d92 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -819,8 +819,7 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) { return nullptr; } -DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode, - const std::string& file_format) { +DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) { auto need_split = false; if (collective::IsFederated()) { LOG(CONSOLE) << "XGBoost federated mode detected, not splitting data among workers"; @@ -862,11 +861,9 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s } // legacy handling of binary data loading - if (file_format == "auto") { - DMatrix* loaded = TryLoadBinary(fname, silent); - if (loaded) { - return loaded; - } + DMatrix* loaded = TryLoadBinary(fname, silent); + if (loaded) { + return loaded; } int partid = 0, npart = 1; @@ -882,47 +879,24 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts"; } + data::ValidateFileFormat(fname); DMatrix* dmat {nullptr}; - try { - if (cache_file.empty()) { - std::unique_ptr> parser( - dmlc::Parser::Create(fname.c_str(), partid, npart, file_format.c_str())); - data::FileAdapter adapter(parser.get()); - dmat = DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), Context{}.Threads(), - cache_file, data_split_mode); - } else { - data::FileIterator iter{fname, static_cast(partid), static_cast(npart), - file_format}; - dmat = new data::SparsePageDMatrix{&iter, - iter.Proxy(), - data::fileiter::Reset, - data::fileiter::Next, - std::numeric_limits::quiet_NaN(), - 1, - cache_file}; - } - } catch (dmlc::Error& e) { - std::vector splited = common::Split(fname, '#'); - std::vector args = common::Split(splited.front(), '?'); - std::string format {file_format}; - if (args.size() == 1 && file_format == "auto") { - auto extension = common::Split(args.front(), '.').back(); - if (extension == "csv" || extension == "libsvm") { - format = extension; - } - if (format == extension) { - LOG(WARNING) - << "No format parameter is provided in input uri, but found file extension: " - << format << " . 
" - << "Consider providing a uri parameter: filename?format=" << format; - } else { - LOG(WARNING) - << "No format parameter is provided in input uri. " - << "Choosing default parser in dmlc-core. " - << "Consider providing a uri parameter like: filename?format=csv"; - } - } - LOG(FATAL) << "Encountered parser error:\n" << e.what(); + + if (cache_file.empty()) { + std::unique_ptr> parser( + dmlc::Parser::Create(fname.c_str(), partid, npart, "auto")); + data::FileAdapter adapter(parser.get()); + dmat = DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), Context{}.Threads(), + cache_file, data_split_mode); + } else { + data::FileIterator iter{fname, static_cast(partid), static_cast(npart)}; + dmat = new data::SparsePageDMatrix{&iter, + iter.Proxy(), + data::fileiter::Reset, + data::fileiter::Next, + std::numeric_limits::quiet_NaN(), + 1, + cache_file}; } if (need_split && data_split_mode == DataSplitMode::kCol) { diff --git a/src/data/file_iterator.h b/src/data/file_iterator.h index 96f0e09d4..4d7239677 100644 --- a/src/data/file_iterator.h +++ b/src/data/file_iterator.h @@ -1,22 +1,50 @@ -/*! - * Copyright 2021 XGBoost contributors +/** + * Copyright 2021-2023, XGBoost contributors */ #ifndef XGBOOST_DATA_FILE_ITERATOR_H_ #define XGBOOST_DATA_FILE_ITERATOR_H_ -#include +#include #include -#include +#include #include +#include +#include "array_interface.h" #include "dmlc/data.h" #include "xgboost/c_api.h" #include "xgboost/json.h" #include "xgboost/linalg.h" -#include "array_interface.h" namespace xgboost { namespace data { +inline void ValidateFileFormat(std::string const& uri) { + std::vector name_cache = common::Split(uri, '#'); + CHECK_LE(name_cache.size(), 2) + << "Only one `#` is allowed in file path for cachefile specification"; + + std::vector name_args = common::Split(name_cache[0], '?'); + CHECK_LE(name_args.size(), 2) << "only one `?` is allowed in file path."; + + StringView msg{"URI parameter `format` is required for loading text data: filename?format=csv"}; + CHECK_EQ(name_args.size(), 2) << msg; + + std::map args; + std::vector arg_list = common::Split(name_args[1], '&'); + for (size_t i = 0; i < arg_list.size(); ++i) { + std::istringstream is(arg_list[i]); + std::pair kv; + CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format" + << " for key in arg " << i + 1; + CHECK(std::getline(is, kv.second)) << "Invalid uri argument format" + << " for value in arg " << i + 1; + args.insert(kv); + } + if (args.find("format") == args.cend()) { + LOG(FATAL) << msg; + } +} + /** * An iterator for implementing external memory support with file inputs. Users of * external memory are encouraged to define their own file parsers/loaders so this one is @@ -31,8 +59,6 @@ class FileIterator { uint32_t part_idx_; // Equals to total number of workers. uint32_t n_parts_; - // Format of the input file, like "libsvm". 
- std::string type_; DMatrixHandle proxy_; @@ -45,10 +71,9 @@ class FileIterator { std::string indices_; public: - FileIterator(std::string uri, unsigned part_index, unsigned num_parts, - std::string type) - : uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts}, - type_{std::move(type)} { + FileIterator(std::string uri, unsigned part_index, unsigned num_parts) + : uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts} { + ValidateFileFormat(uri_); XGProxyDMatrixCreate(&proxy_); } ~FileIterator() { @@ -94,9 +119,7 @@ class FileIterator { auto Proxy() -> decltype(proxy_) { return proxy_; } void Reset() { - CHECK(!type_.empty()); - parser_.reset(dmlc::Parser::Create(uri_.c_str(), part_idx_, - n_parts_, type_.c_str())); + parser_.reset(dmlc::Parser::Create(uri_.c_str(), part_idx_, n_parts_, "auto")); } }; diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index ccfdbff52..b8de641ff 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -88,7 +88,8 @@ inline std::shared_ptr GetExternalMemoryDMatrixFromData( fo << row_data.str() << "\n"; } fo.close(); - return std::shared_ptr(DMatrix::Load(tmp_file + "#" + tmp_file + ".cache")); + return std::shared_ptr( + DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache")); } // Test that elements are approximately equally distributed among bins diff --git a/tests/cpp/data/test_file_iterator.cc b/tests/cpp/data/test_file_iterator.cc index 31da2c1fa..bd8c4b9c2 100644 --- a/tests/cpp/data/test_file_iterator.cc +++ b/tests/cpp/data/test_file_iterator.cc @@ -29,16 +29,16 @@ TEST(FileIterator, Basic) { { auto zpath = tmpdir.path + "/0-based.svm"; CreateBigTestData(zpath, 3 * 64, true); - zpath += "?indexing_mode=0"; - FileIterator iter{zpath, 0, 1, "libsvm"}; + zpath += "?indexing_mode=0&format=libsvm"; + FileIterator iter{zpath, 0, 1}; check_n_features(&iter); } { auto opath = tmpdir.path + "/1-based.svm"; CreateBigTestData(opath, 3 * 64, false); - opath += "?indexing_mode=1"; - FileIterator iter{opath, 0, 1, "libsvm"}; + opath += "?indexing_mode=1&format=libsvm"; + FileIterator iter{opath, 0, 1}; check_n_features(&iter); } } diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index 895844180..5ebe1c6bd 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -157,8 +157,7 @@ TEST(MetaInfo, LoadQid) { dmlc::TemporaryDirectory tempdir; std::string tmp_file = tempdir.path + "/qid_test.libsvm"; { - std::unique_ptr fs( - dmlc::Stream::Create(tmp_file.c_str(), "w")); + std::unique_ptr fs(dmlc::Stream::Create(tmp_file.c_str(), "w")); dmlc::ostream os(fs.get()); os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0 2 qid:1 1:0 2:0 3:1 4:0.1 5:1 @@ -175,7 +174,7 @@ TEST(MetaInfo, LoadQid) { os.set_stream(nullptr); } std::unique_ptr dmat( - xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kRow, "libsvm")); + xgboost::DMatrix::Load(tmp_file + "?format=libsvm", true, xgboost::DataSplitMode::kRow)); const xgboost::MetaInfo& info = dmat->Info(); const std::vector expected_group_ptr{0, 4, 8, 12}; diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index a37352626..3bdbf5403 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -17,11 +17,15 @@ using namespace xgboost; // NOLINT +namespace { +std::string UriSVM(std::string name) { return name + "?format=libsvm"; } +} // namespace + TEST(SimpleDMatrix, MetaInfo) { 
diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h
index ccfdbff52..b8de641ff 100644
--- a/tests/cpp/common/test_hist_util.h
+++ b/tests/cpp/common/test_hist_util.h
@@ -88,7 +88,8 @@ inline std::shared_ptr<DMatrix> GetExternalMemoryDMatrixFromData(
     fo << row_data.str() << "\n";
   }
   fo.close();
-  return std::shared_ptr<DMatrix>(DMatrix::Load(tmp_file + "#" + tmp_file + ".cache"));
+  return std::shared_ptr<DMatrix>(
+      DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache"));
 }
 
 // Test that elements are approximately equally distributed among bins
diff --git a/tests/cpp/data/test_file_iterator.cc b/tests/cpp/data/test_file_iterator.cc
index 31da2c1fa..bd8c4b9c2 100644
--- a/tests/cpp/data/test_file_iterator.cc
+++ b/tests/cpp/data/test_file_iterator.cc
@@ -29,16 +29,16 @@ TEST(FileIterator, Basic) {
   {
     auto zpath = tmpdir.path + "/0-based.svm";
     CreateBigTestData(zpath, 3 * 64, true);
-    zpath += "?indexing_mode=0";
-    FileIterator iter{zpath, 0, 1, "libsvm"};
+    zpath += "?indexing_mode=0&format=libsvm";
+    FileIterator iter{zpath, 0, 1};
 
     check_n_features(&iter);
   }
 
   {
     auto opath = tmpdir.path + "/1-based.svm";
     CreateBigTestData(opath, 3 * 64, false);
-    opath += "?indexing_mode=1";
-    FileIterator iter{opath, 0, 1, "libsvm"};
+    opath += "?indexing_mode=1&format=libsvm";
+    FileIterator iter{opath, 0, 1};
 
     check_n_features(&iter);
   }
 }
diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc
index 895844180..5ebe1c6bd 100644
--- a/tests/cpp/data/test_metainfo.cc
+++ b/tests/cpp/data/test_metainfo.cc
@@ -157,8 +157,7 @@ TEST(MetaInfo, LoadQid) {
   dmlc::TemporaryDirectory tempdir;
   std::string tmp_file = tempdir.path + "/qid_test.libsvm";
   {
-    std::unique_ptr<dmlc::Stream> fs(
-        dmlc::Stream::Create(tmp_file.c_str(), "w"));
+    std::unique_ptr<dmlc::Stream> fs(dmlc::Stream::Create(tmp_file.c_str(), "w"));
     dmlc::ostream os(fs.get());
     os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
 2 qid:1 1:0 2:0 3:1 4:0.1 5:1
@@ -175,7 +174,7 @@ TEST(MetaInfo, LoadQid) {
     os.set_stream(nullptr);
   }
   std::unique_ptr<xgboost::DMatrix> dmat(
-      xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kRow, "libsvm"));
+      xgboost::DMatrix::Load(tmp_file + "?format=libsvm", true, xgboost::DataSplitMode::kRow));
 
   const xgboost::MetaInfo& info = dmat->Info();
   const std::vector<xgboost::bst_group_t> expected_group_ptr{0, 4, 8, 12};
diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc
index a37352626..3bdbf5403 100644
--- a/tests/cpp/data/test_simple_dmatrix.cc
+++ b/tests/cpp/data/test_simple_dmatrix.cc
@@ -17,11 +17,15 @@
 
 using namespace xgboost;  // NOLINT
 
+namespace {
+std::string UriSVM(std::string name) { return name + "?format=libsvm"; }
+}  // namespace
+
 TEST(SimpleDMatrix, MetaInfo) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
 
   // Test the metadata that was parsed
   EXPECT_EQ(dmat->Info().num_row_, 2);
@@ -37,7 +41,7 @@ TEST(SimpleDMatrix, RowAccess) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, false);
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file), false);
 
   // Loop over the batches and count the records
   int64_t row_count = 0;
@@ -60,7 +64,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
 
   ASSERT_TRUE(dmat->SingleColBlock());
 
@@ -387,7 +391,7 @@ TEST(SimpleDMatrix, SaveLoadBinary) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file);
+  xgboost::DMatrix * dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
   data::SimpleDMatrix *simple_dmat = dynamic_cast<data::SimpleDMatrix *>(dmat);
 
   const std::string tmp_binfile = tempdir.path + "/csr_source.binary";
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc
index 24dc40949..608c32947 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -16,14 +16,19 @@
 #include "../helpers.h"
 
 using namespace xgboost;  // NOLINT
+namespace {
+std::string UriSVM(std::string name, std::string cache) {
+  return name + "?format=libsvm" + "#" + cache + ".cache";
+}
+}  // namespace
 
 template <typename Page>
 void TestSparseDMatrixLoadFile() {
   dmlc::TemporaryDirectory tmpdir;
   auto opath = tmpdir.path + "/1-based.svm";
   CreateBigTestData(opath, 3 * 64, false);
-  opath += "?indexing_mode=1";
-  data::FileIterator iter{opath, 0, 1, "libsvm"};
+  opath += "?indexing_mode=1&format=libsvm";
+  data::FileIterator iter{opath, 0, 1};
   auto n_threads = 0;
   data::SparsePageDMatrix m{&iter, iter.Proxy(),
@@ -112,15 +117,13 @@ TEST(SparsePageDMatrix, MetaInfo) {
   size_t constexpr kEntries = 24;
   CreateBigTestData(tmp_file, kEntries);
 
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", false);
+  std::unique_ptr<xgboost::DMatrix> dmat{xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file), false)};
 
   // Test the metadata that was parsed
   EXPECT_EQ(dmat->Info().num_row_, 8ul);
   EXPECT_EQ(dmat->Info().num_col_, 5ul);
   EXPECT_EQ(dmat->Info().num_nonzero_, kEntries);
   EXPECT_EQ(dmat->Info().labels.Size(), dmat->Info().num_row_);
-
-  delete dmat;
 }
 
 TEST(SparsePageDMatrix, RowAccess) {
@@ -139,7 +142,7 @@ TEST(SparsePageDMatrix, ColAccess) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file));
 
   // Loop over the batches and assert the data is as expected
   size_t iter = 0;
@@ -231,7 +234,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) {
   std::string filename = tempdir.path + "/simple.libsvm";
   CreateBigTestData(filename, 1 << 16);
-  data::FileIterator iter(filename, 0, 1, "auto");
+  data::FileIterator iter(filename + "?format=libsvm", 0, 1);
   std::unique_ptr<DMatrix> sparse{
       new data::SparsePageDMatrix{&iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
                                   std::numeric_limits<float>::quiet_NaN(), threads, filename}};
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu
index bb562ffb7..55a44e458 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -13,7 +13,7 @@ TEST(SparsePageDMatrix, EllpackPage) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
+  DMatrix* dmat = DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache");
 
   // Loop over the batches and assert the data is as expected
   size_t n = 0;
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 76fd2f967..7c81b96f9 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -548,7 +548,7 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
   }
   fo.close();
 
-  std::string uri = tmp_file;
+  std::string uri = tmp_file + "?format=libsvm";
   if (page_size > 0) {
     uri += "#" + tmp_file + ".cache";
   }
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 91e8070c2..a3bb30fcd 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -126,7 +126,8 @@ TEST(Learner, SLOW_CheckMultiBatch) {  // NOLINT
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/big.libsvm";
   CreateBigTestData(tmp_file, 50000);
-  std::shared_ptr<DMatrix> dmat(xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache"));
+  std::shared_ptr<DMatrix> dmat(
+      xgboost::DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache"));
   EXPECT_FALSE(dmat->SingleColBlock());
   size_t num_row = dmat->Info().num_row_;
   std::vector<float> labels(num_row);
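The Python test changes that follow replace inline `xgb.DMatrix(...)` construction with a new `tm.load_agaricus(__file__)` helper added to `python-package/xgboost/testing/__init__.py` (that file's diff is not part of this excerpt). A rough, hypothetical sketch of such a helper, consistent with how the tests below use it and with the `?format=libsvm` URIs elsewhere in this patch:

    import os
    import xgboost as xgb

    def load_agaricus(path: str):
        """Return (dtrain, dtest) built from the demo agaricus files near the calling test."""
        # Hypothetical layout: resolve the demo data directory relative to the test file.
        data_dir = os.path.join(os.path.dirname(os.path.abspath(path)), "..", "..", "demo", "data")
        dtrain = xgb.DMatrix(os.path.join(data_dir, "agaricus.txt.train") + "?format=libsvm")
        dtest = xgb.DMatrix(os.path.join(data_dir, "agaricus.txt.test") + "?format=libsvm")
        return dtrain, dtest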
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index fab2a6eca..e512e4bc6 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -21,8 +21,7 @@ class TestBasic:
         assert not lazy_isinstance(a, 'numpy', 'dataframe')
 
     def test_basic(self):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
         # specify validations set to watch performance
@@ -61,8 +60,7 @@ class TestBasic:
     def test_metric_config(self):
         # Make sure that the metric configuration happens in booster so the
         # string `['error', 'auc']` doesn't get passed down to core.
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic', 'eval_metric': ['error', 'auc']}
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -78,8 +76,7 @@ class TestBasic:
         np.testing.assert_allclose(predt_0, predt_1)
 
     def test_multiclass(self):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'num_class': 2}
         # specify validations set to watch performance
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -188,7 +185,7 @@ class TestBasic:
         assert dm.num_col() == cols
 
     def test_cv(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                   'objective': 'binary:logistic'}
 
@@ -198,7 +195,7 @@ class TestBasic:
         assert len(cv) == (4)
 
     def test_cv_no_shuffle(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                   'objective': 'binary:logistic'}
 
@@ -209,7 +206,7 @@ class TestBasic:
         assert len(cv) == (4)
 
     def test_cv_explicit_fold_indices(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective':
                   'binary:logistic'}
         folds = [
@@ -268,8 +265,7 @@ class TestBasicPathLike:
 
     def test_DMatrix_init_from_path(self):
         """Initialization from the data path."""
-        dpath = Path('demo/data')
-        dtrain = xgb.DMatrix(dpath / 'agaricus.txt.train')
+        dtrain, _ = tm.load_agaricus(__file__)
         assert dtrain.num_row() == 6513
         assert dtrain.num_col() == 127
 
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index d76205593..610a9236e 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -42,8 +42,7 @@ class TestModels:
         param = {'verbosity': 0, 'objective': 'binary:logistic',
                  'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
                  'nthread': 1}
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         num_round = 4
         bst = xgb.train(param, dtrain, num_round, watchlist)
@@ -55,8 +54,7 @@ class TestModels:
         assert err < 0.2
 
     def test_dart(self):
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 5, 'objective': 'binary:logistic',
                  'eval_metric': 'logloss', 'booster': 'dart', 'verbosity': 1}
         # specify validations set to watch performance
@@ -122,7 +120,7 @@ class TestModels:
 
     def test_boost_from_prediction(self):
         # Re-construct dtrain here to avoid modification
-        margined = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        margined, _ = tm.load_agaricus(__file__)
         bst = xgb.train({'tree_method': 'hist'}, margined, 1)
         predt_0 = bst.predict(margined, output_margin=True)
         margined.set_base_margin(predt_0)
@@ -130,13 +128,13 @@ class TestModels:
         predt_1 = bst.predict(margined)
         assert np.any(np.abs(predt_1 - predt_0) > 1e-6)
 
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        dtrain, _ = tm.load_agaricus(__file__)
         bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
         predt_2 = bst.predict(dtrain)
         assert np.all(np.abs(predt_2 - predt_1) < 1e-6)
 
     def test_boost_from_existing_model(self):
-        X = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        X, _ = tm.load_agaricus(__file__)
         booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4)
         assert booster.num_boosted_rounds() == 4
         booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4,
@@ -156,8 +154,7 @@ class TestModels:
             'objective': 'reg:logistic',
             "tree_method": tree_method
         }
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         num_round = 10
 
@@ -203,8 +200,7 @@ class TestModels:
         self.run_custom_objective()
 
     def test_multi_eval_metric(self):
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         param = {'max_depth': 2, 'eta': 0.2, 'verbosity': 1,
                  'objective': 'binary:logistic'}
@@ -226,7 +222,7 @@ class TestModels:
             param['scale_pos_weight'] = ratio
             return (dtrain, dtest, param)
 
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        dtrain, _ = tm.load_agaricus(__file__)
         xgb.cv(param, dtrain, num_round, nfold=5,
                metrics={'auc'}, seed=0, fpreproc=fpreproc)
 
@@ -234,7 +230,7 @@ class TestModels:
         param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic'}
         num_round = 2
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        dtrain, _ = tm.load_agaricus(__file__)
         xgb.cv(param, dtrain, num_round, nfold=5,
                metrics={'error'}, seed=0, show_stdv=False)
 
@@ -392,7 +388,7 @@ class TestModels:
         os.remove(model_path)
 
         try:
-            dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+            dtrain, _ = tm.load_agaricus(__file__)
             xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)
         except ValueError as e:
             e_str = str(e)
diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py
index e8375aa5e..d3ec05e6e 100644
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -275,9 +275,7 @@ class TestCallbacks:
         """Test learning rate scheduler, used by both CPU and GPU tests."""
         scheduler = xgb.callback.LearningRateScheduler
 
-        dpath = tm.data_dir(__file__)
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         num_round = 4
 
@@ -361,9 +359,7 @@ class TestCallbacks:
         num_round = 4
         scheduler = xgb.callback.LearningRateScheduler
 
-        dpath = tm.data_dir(__file__)
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
 
         param = {
diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py
index ef56ff656..bcc089afb 100644
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -283,7 +283,7 @@ class TestDMatrix:
         assert m0.feature_types == m1.feature_types
 
     def test_get_info(self):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dtrain, _ = tm.load_agaricus(__file__)
         dtrain.get_float_info('label')
         dtrain.get_float_info('weight')
         dtrain.get_float_info('base_margin')
@@ -432,7 +432,7 @@ class TestDMatrix:
     def test_uri_categorical(self):
         path = os.path.join(dpath, 'agaricus.txt.train')
         feature_types = ["q"] * 5 + ["c"] + ["q"] * 120
-        Xy = xgb.DMatrix(path + "?indexing_mode=1", feature_types=feature_types)
+        Xy = xgb.DMatrix(
+            path + "?indexing_mode=1&format=libsvm", feature_types=feature_types
+        )
         np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types))
 
     def test_base_margin(self):
diff --git a/tests/python/test_interaction_constraints.py b/tests/python/test_interaction_constraints.py
index 96d2ba7dc..5eaaf1f8c 100644
--- a/tests/python/test_interaction_constraints.py
+++ b/tests/python/test_interaction_constraints.py
@@ -88,8 +88,12 @@ class TestInteractionConstraints:
     def training_accuracy(self, tree_method):
         """Test accuracy, reused by GPU tests."""
         from sklearn.metrics import accuracy_score
-        dtrain = xgboost.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
-        dtest = xgboost.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
+        dtrain = xgboost.DMatrix(
+            dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm"
+        )
+        dtest = xgboost.DMatrix(
+            dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm"
+        )
         params = {
             'eta': 1,
             'max_depth': 6,
diff --git a/tests/python/test_monotone_constraints.py b/tests/python/test_monotone_constraints.py
index 4dbfaa60d..a3785f1cb 100644
--- a/tests/python/test_monotone_constraints.py
+++ b/tests/python/test_monotone_constraints.py
@@ -134,8 +134,8 @@ class TestMonotoneConstraints:
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_training_accuracy(self):
         from sklearn.metrics import accuracy_score
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
+        dtrain = xgb.DMatrix(dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm")
+        dtest = xgb.DMatrix(dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm")
         params = {'eta': 1, 'max_depth': 6, 'objective': 'binary:logistic',
                   'tree_method': 'hist', 'monotone_constraints': '(1, 0)'}
         num_boost_round = 5
diff --git a/tests/python/test_openmp.py b/tests/python/test_openmp.py
index c53363736..82b0ba270 100644
--- a/tests/python/test_openmp.py
+++ b/tests/python/test_openmp.py
@@ -13,9 +13,7 @@ pytestmark = tm.timeout(10)
 
 class TestOMP:
     def test_omp(self):
-        dpath = 'demo/data/'
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
 
         param = {'booster': 'gbtree',
                  'objective': 'binary:logistic',
diff --git a/tests/python/test_parse_tree.py b/tests/python/test_parse_tree.py
index 885c0f1e2..9d80d0f6f 100644
--- a/tests/python/test_parse_tree.py
+++ b/tests/python/test_parse_tree.py
@@ -13,7 +13,7 @@ rng = np.random.RandomState(1994)
 
 class TestTreesToDataFrame:
     def build_model(self, max_depth, num_round):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dtrain, _ = tm.load_agaricus(__file__)
         param = {'max_depth': max_depth, 'objective': 'binary:logistic',
                  'verbosity': 1}
         num_round = num_round
diff --git a/tests/python/test_plotting.py b/tests/python/test_plotting.py
index dc45cd254..303c7c8c1 100644
--- a/tests/python/test_plotting.py
+++ b/tests/python/test_plotting.py
@@ -17,12 +17,10 @@ except ImportError:
 
 pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(), tm.no_graphviz()))
 
-dpath = 'demo/data/agaricus.txt.train'
-
 
 class TestPlotting:
     def test_plotting(self):
-        m = xgb.DMatrix(dpath)
+        m, _ = tm.load_agaricus(__file__)
         booster = xgb.train({'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}, m,
                            num_boost_round=2)
diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py
index 4d861ad6e..2585da088 100644
--- a/tests/python/test_shap.py
+++ b/tests/python/test_shap.py
@@ -46,8 +46,8 @@ class TestSHAP:
         fscores = bst.get_fscore()
         assert scores1 == fscores
 
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?format=libsvm')
+        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?format=libsvm')
 
         def fn(max_depth, num_rounds):
             # train
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index dd710f6a4..78097a4ea 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -154,9 +154,7 @@ class TestTreeMethod:
 
     def test_hist_categorical(self):
         # hist must be same as exact on all-categorial data
-        dpath = 'demo/data/'
-        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        ag_dtrain, ag_dtest = tm.load_agaricus(__file__)
         ag_param = {'max_depth': 2,
                     'tree_method': 'hist',
                     'eta': 1,
diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py
index 07295eb6c..f8a21b6ab 100644
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -222,7 +222,7 @@ class TestPandas:
         set_base_margin_info(pd.DataFrame, xgb.DMatrix, "hist")
 
     def test_cv_as_pandas(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                   'objective': 'binary:logistic', 'eval_metric': 'error'}