diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 1d8cb0f23..21d39f255 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -72,7 +72,7 @@ test_that("xgb.DMatrix: saving, loading", { tmp <- c("0 1:1 2:1", "1 3:1", "0 1:1") tmp_file <- tempfile(fileext = ".libsvm") writeLines(tmp, tmp_file) - dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE) + dtest4 <- xgb.DMatrix(paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE) expect_equal(dim(dtest4), c(3, 4)) expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0)) diff --git a/demo/CLI/binary_classification/mushroom.conf b/demo/CLI/binary_classification/mushroom.conf index 3cf865465..d78199cd7 100644 --- a/demo/CLI/binary_classification/mushroom.conf +++ b/demo/CLI/binary_classification/mushroom.conf @@ -20,10 +20,10 @@ num_round = 2 # 0 means do not save any model except the final round model save_period = 2 # The path of training data -data = "agaricus.txt.train" +data = "agaricus.txt.train?format=libsvm" # The path of validation data, used to monitor training process, here [test] sets name of the validation set -eval[test] = "agaricus.txt.test" +eval[test] = "agaricus.txt.test?format=libsvm" # evaluate on training data as well each round eval_train = 1 # The path of test data -test:data = "agaricus.txt.test" +test:data = "agaricus.txt.test?format=libsvm" diff --git a/demo/CLI/regression/machine.conf b/demo/CLI/regression/machine.conf index 4ba8437d5..42e2b1227 100644 --- a/demo/CLI/regression/machine.conf +++ b/demo/CLI/regression/machine.conf @@ -21,8 +21,8 @@ num_round = 2 # 0 means do not save any model except the final round model save_period = 0 # The path of training data -data = "machine.txt.train" +data = "machine.txt.train?format=libsvm" # The path of validation data, used to monitor training process, here [test] sets name of the validation set -eval[test] = "machine.txt.test" +eval[test] = 
"machine.txt.test?format=libsvm" # The path of test data -test:data = "machine.txt.test" +test:data = "machine.txt.test?format=libsvm" diff --git a/demo/c-api/basic/c-api-demo.c b/demo/c-api/basic/c-api-demo.c index ca6e689aa..15a224e9e 100644 --- a/demo/c-api/basic/c-api-demo.c +++ b/demo/c-api/basic/c-api-demo.c @@ -42,8 +42,8 @@ int main() { // load the data DMatrixHandle dtrain, dtest; - safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train", silent, &dtrain)); - safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test", silent, &dtest)); + safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train?format=libsvm", silent, &dtrain)); + safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test?format=libsvm", silent, &dtest)); // create the booster BoosterHandle booster; diff --git a/demo/guide-python/boost_from_prediction.py b/demo/guide-python/boost_from_prediction.py index 53a45549a..13f91d7c8 100644 --- a/demo/guide-python/boost_from_prediction.py +++ b/demo/guide-python/boost_from_prediction.py @@ -7,15 +7,19 @@ import os import xgboost as xgb CURRENT_DIR = os.path.dirname(__file__) -dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train')) -dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test')) -watchlist = [(dtest, 'eval'), (dtrain, 'train')] +dtrain = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm") +) +dtest = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm") +) +watchlist = [(dtest, "eval"), (dtrain, "train")] ### # advanced: start from a initial base prediction # -print('start running example to start from a initial prediction') +print("start running example to start from a initial prediction") # specify parameters via map, definition are same as c++ version -param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} +param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} # train 
xgboost for 1 round bst = xgb.train(param, dtrain, 1, watchlist) # Note: we need the margin value instead of transformed prediction in @@ -27,5 +31,5 @@ ptest = bst.predict(dtest, output_margin=True) dtrain.set_base_margin(ptrain) dtest.set_base_margin(ptest) -print('this is result of running from initial prediction') +print("this is result of running from initial prediction") bst = xgb.train(param, dtrain, 1, watchlist) diff --git a/demo/guide-python/cross_validation.py b/demo/guide-python/cross_validation.py index 2565b02c9..4e537108a 100644 --- a/demo/guide-python/cross_validation.py +++ b/demo/guide-python/cross_validation.py @@ -10,27 +10,45 @@ import xgboost as xgb # load data in do training CURRENT_DIR = os.path.dirname(__file__) -dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train')) -param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'} +dtrain = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm") +) +param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} num_round = 2 -print('running cross validation') +print("running cross validation") # do cross validation, this will print result out as # [iteration] metric_name:mean_value+std_value # std_value is standard deviation of the metric -xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'error'}, seed=0, - callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)]) +xgb.cv( + param, + dtrain, + num_round, + nfold=5, + metrics={"error"}, + seed=0, + callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)], +) -print('running cross validation, disable standard deviation display') +print("running cross validation, disable standard deviation display") # do cross validation, this will print result out as # [iteration] metric_name:mean_value -res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5, - metrics={'error'}, seed=0, - callbacks=[xgb.callback.EvaluationMonitor(show_stdv=False), - xgb.callback.EarlyStopping(3)]) +res = 
xgb.cv( + param, + dtrain, + num_boost_round=10, + nfold=5, + metrics={"error"}, + seed=0, + callbacks=[ + xgb.callback.EvaluationMonitor(show_stdv=False), + xgb.callback.EarlyStopping(3), + ], +) print(res) -print('running cross validation, with preprocessing function') +print("running cross validation, with preprocessing function") + + # define the preprocessing function # used to return the preprocessed training, test data, and parameter # we can use this to do weight rescale, etc. @@ -38,32 +56,36 @@ print('running cross validation, with preprocessing function') def fpreproc(dtrain, dtest, param): label = dtrain.get_label() ratio = float(np.sum(label == 0)) / np.sum(label == 1) - param['scale_pos_weight'] = ratio + param["scale_pos_weight"] = ratio return (dtrain, dtest, param) + # do cross validation, for each fold # the dtrain, dtest, param will be passed into fpreproc # then the return value of fpreproc will be used to generate # results of that fold -xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'auc'}, seed=0, fpreproc=fpreproc) +xgb.cv(param, dtrain, num_round, nfold=5, metrics={"auc"}, seed=0, fpreproc=fpreproc) ### # you can also do cross validation with customized loss function # See custom_objective.py ## -print('running cross validation, with customized loss function') +print("running cross validation, with customized loss function") + + def logregobj(preds, dtrain): labels = dtrain.get_label() preds = 1.0 / (1.0 + np.exp(-preds)) grad = preds - labels hess = preds * (1.0 - preds) return grad, hess + + def evalerror(preds, dtrain): labels = dtrain.get_label() - return 'error', float(sum(labels != (preds > 0.0))) / len(labels) + return "error", float(sum(labels != (preds > 0.0))) / len(labels) -param = {'max_depth':2, 'eta':1} + +param = {"max_depth": 2, "eta": 1} # train with customized objective -xgb.cv(param, dtrain, num_round, nfold=5, seed=0, - obj=logregobj, feval=evalerror) +xgb.cv(param, dtrain, num_round, nfold=5, seed=0, 
obj=logregobj, feval=evalerror) diff --git a/demo/guide-python/evals_result.py b/demo/guide-python/evals_result.py index bba8862f5..7b9da96da 100644 --- a/demo/guide-python/evals_result.py +++ b/demo/guide-python/evals_result.py @@ -7,28 +7,37 @@ import os import xgboost as xgb CURRENT_DIR = os.path.dirname(__file__) -dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train')) -dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test')) +dtrain = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm") +) +dtest = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm") +) -param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')] +param = [ + ("max_depth", 2), + ("objective", "binary:logistic"), + ("eval_metric", "logloss"), + ("eval_metric", "error"), +] num_round = 2 -watchlist = [(dtest,'eval'), (dtrain,'train')] +watchlist = [(dtest, "eval"), (dtrain, "train")] evals_result = {} bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result) -print('Access logloss metric directly from evals_result:') -print(evals_result['eval']['logloss']) +print("Access logloss metric directly from evals_result:") +print(evals_result["eval"]["logloss"]) -print('') -print('Access metrics through a loop:') +print("") +print("Access metrics through a loop:") for e_name, e_mtrs in evals_result.items(): - print('- {}'.format(e_name)) + print("- {}".format(e_name)) for e_mtr_name, e_mtr_vals in e_mtrs.items(): - print(' - {}'.format(e_mtr_name)) - print(' - {}'.format(e_mtr_vals)) + print(" - {}".format(e_mtr_name)) + print(" - {}".format(e_mtr_vals)) -print('') -print('Access complete dictionary:') +print("") +print("Access complete dictionary:") print(evals_result) diff --git a/demo/guide-python/generalized_linear_model.py b/demo/guide-python/generalized_linear_model.py index 976428f13..3387b1982 100644 --- 
a/demo/guide-python/generalized_linear_model.py +++ b/demo/guide-python/generalized_linear_model.py @@ -11,14 +11,22 @@ import xgboost as xgb # basically, we are using linear model, instead of tree for our boosters ## CURRENT_DIR = os.path.dirname(__file__) -dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train')) -dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test')) +dtrain = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm") +) +dtest = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm") +) # change booster to gblinear, so that we are fitting a linear model # alpha is the L1 regularizer # lambda is the L2 regularizer # you can also set lambda_bias which is L2 regularizer on the bias term -param = {'objective':'binary:logistic', 'booster':'gblinear', - 'alpha': 0.0001, 'lambda': 1} +param = { + "objective": "binary:logistic", + "booster": "gblinear", + "alpha": 0.0001, + "lambda": 1, +} # normally, you do not need to set eta (step_size) # XGBoost uses a parallel coordinate descent algorithm (shotgun), @@ -29,9 +37,15 @@ param = {'objective':'binary:logistic', 'booster':'gblinear', ## # the rest of settings are the same ## -watchlist = [(dtest, 'eval'), (dtrain, 'train')] +watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 4 bst = xgb.train(param, dtrain, num_round, watchlist) preds = bst.predict(dtest) labels = dtest.get_label() -print('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds)))) +print( + "error=%f" + % ( + sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) + / float(len(preds)) + ) +) diff --git a/demo/guide-python/predict_first_ntree.py b/demo/guide-python/predict_first_ntree.py index 55f7c61af..78137b4e1 100644 --- a/demo/guide-python/predict_first_ntree.py +++ b/demo/guide-python/predict_first_ntree.py @@ -16,8 +16,8 @@ test = os.path.join(CURRENT_DIR, 
"../data/agaricus.txt.test") def native_interface(): # load data in do training - dtrain = xgb.DMatrix(train) - dtest = xgb.DMatrix(test) + dtrain = xgb.DMatrix(train + "?format=libsvm") + dtest = xgb.DMatrix(test + "?format=libsvm") param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 3 diff --git a/demo/guide-python/predict_leaf_indices.py b/demo/guide-python/predict_leaf_indices.py index 45cc8fa7f..627619724 100644 --- a/demo/guide-python/predict_leaf_indices.py +++ b/demo/guide-python/predict_leaf_indices.py @@ -8,14 +8,18 @@ import xgboost as xgb # load data in do training CURRENT_DIR = os.path.dirname(__file__) -dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train')) -dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test')) -param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} -watchlist = [(dtest, 'eval'), (dtrain, 'train')] +dtrain = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm") +) +dtest = xgb.DMatrix( + os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm") +) +param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} +watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 3 bst = xgb.train(param, dtrain, num_round, watchlist) -print('start testing predict the leaf indices') +print("start testing predict the leaf indices") # predict using first 2 tree leafindex = bst.predict( dtest, iteration_range=(0, 2), pred_leaf=True, strict_shape=True diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst index 3b96cfe92..006d63b43 100644 --- a/doc/tutorials/external_memory.rst +++ b/doc/tutorials/external_memory.rst @@ -77,7 +77,7 @@ The external memory version takes in the following `URI `_ for a description of the CSV format.). 
Please be careful that, XGBoost does **not** understand file extensions, nor try to guess the file format, as there is no universal agreement upon file extension of LIBSVM or CSV. Instead it employs `URI `_ format for specifying the precise input file type. For example if you provide a `csv` file ``./data.train.csv`` as input, XGBoost will blindly use the default LIBSVM parser to digest it and generate a parser error. Instead, users need to provide an URI in the form of ``train.csv?format=csv``. For external memory input, the URI should of a form similar to ``train.csv?format=csv#dtrain.cache``. See :ref:`python_data_interface` and :doc:`/tutorials/external_memory` also. + +XGBoost currently supports two text formats for ingesting data: LIBSVM and CSV. The rest of this document will describe the LIBSVM format. (See `this Wikipedia article `_ for a description of the CSV format.). Please be careful that, XGBoost does **not** understand file extensions, nor try to guess the file format, as there is no universal agreement upon file extension of LIBSVM or CSV. Instead it employs `URI `_ format for specifying the precise input file type. For example if you provide a `csv` file ``./data.train.csv`` as input, XGBoost will blindly use the default LIBSVM parser to digest it and generate a parser error. Instead, users need to provide a URI in the form of ``train.csv?format=csv`` or ``train.libsvm?format=libsvm``. For external memory input, the URI should be of a form similar to ``train.csv?format=csv#dtrain.cache``. See :ref:`python_data_interface` and :doc:`/tutorials/external_memory` also. For training or predicting, XGBoost takes an instance file with the format as below: diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index e56680780..4b9d37335 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -138,7 +138,11 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle /*!
* \brief load a data matrix * \param config JSON encoded parameters for DMatrix construction. Accepted fields are: - * - uri: The URI of the input file. + + * - uri: The URI of the input file. The URI parameter `format` is required when loading text data. + * \verbatim embed:rst:leading-asterisk + * See :doc:`/tutorials/input_format` for more info. + * \endverbatim * - silent (optional): Whether to print message during loading. Default to true. * - data_split_mode (optional): Whether to split by row or column. In distributed mode, the * file is split accordingly; otherwise this is only an indicator on how the file was split diff --git a/include/xgboost/data.h b/include/xgboost/data.h index fe22fb2b5..3f7b6ad85 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -566,21 +566,17 @@ class DMatrix { return Info().num_nonzero_ == Info().num_row_ * Info().num_col_; } - /*! + /** * \brief Load DMatrix from URI. + * * \param uri The URI of input. * \param silent Whether print information during loading. * \param data_split_mode In distributed mode, split the input according this mode; otherwise, * it's just an indicator on how the input was split beforehand. - * \param file_format The format type of the file, used for dmlc::Parser::Create. - * By default "auto" will be able to load in both local binary file. - * \param page_size Page size for external memory. * \return The created DMatrix. */ - static DMatrix* Load(const std::string& uri, - bool silent = true, - DataSplitMode data_split_mode = DataSplitMode::kRow, - const std::string& file_format = "auto"); + static DMatrix* Load(const std::string& uri, bool silent = true, + DataSplitMode data_split_mode = DataSplitMode::kRow); /** * \brief Creates a new DMatrix from an external data adapter. 
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java index 7e4fe6806..8a74b74da 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2021 by Contributors + Copyright (c) 2014-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -62,8 +62,8 @@ public class BasicWalkThrough { public static void main(String[] args) throws IOException, XGBoostError { // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); HashMap params = new HashMap(); params.put("eta", 1.0); @@ -112,7 +112,8 @@ public class BasicWalkThrough { System.out.println("start build dmatrix from csr sparse data ..."); //build dmatrix from CSR Sparse Matrix - DataLoader.CSRSparseData spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train"); + DataLoader.CSRSparseData spData = + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm"); DMatrix trainMat2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data, DMatrix.SparseType.CSR, 127); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java index 7eb9e99f0..fe5db0465 100644 --- 
a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java @@ -32,8 +32,8 @@ public class BoostFromPrediction { System.out.println("start running example to start from a initial prediction"); // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); //specify parameters HashMap params = new HashMap(); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java index dbe5f368c..3577be226 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java @@ -30,7 +30,7 @@ import ml.dmlc.xgboost4j.java.XGBoostError; public class CrossValidation { public static void main(String[] args) throws IOException, XGBoostError { //load train mat - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); //set params HashMap params = new HashMap(); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java index 6d529974c..c631dc01a 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java +++ 
b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java @@ -139,9 +139,9 @@ public class CustomObjective { public static void main(String[] args) throws XGBoostError { //load train mat (svmlight format) - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); //load valid mat (svmlight format) - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); HashMap params = new HashMap(); params.put("eta", 1.0); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java index 61e752f85..9e52c12fd 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java @@ -29,9 +29,9 @@ import ml.dmlc.xgboost4j.java.example.util.DataLoader; public class EarlyStopping { public static void main(String[] args) throws IOException, XGBoostError { DataLoader.CSRSparseData trainCSR = - DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train"); + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm"); DataLoader.CSRSparseData testCSR = - DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test"); + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test?format=libsvm"); Map paramMap = new HashMap() { { diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java index 349098ae1..70b2b85b5 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java +++ 
b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java @@ -32,8 +32,8 @@ public class ExternalMemory { //this is the only difference, add a # followed by a cache prefix name //several cache file with the prefix will be generated //currently only support convert from libsvm file - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm#dtrain.cache"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm#dtest.cache"); //specify parameters HashMap params = new HashMap(); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java index 422cdea6a..09cc91c7f 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java @@ -32,8 +32,8 @@ import ml.dmlc.xgboost4j.java.example.util.CustomEval; public class GeneralizedLinearModel { public static void main(String[] args) throws XGBoostError { // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); //specify parameters //change booster to gblinear, so that we are fitting a linear model diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java 
b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java index c98534a93..9038502bd 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java @@ -31,8 +31,8 @@ import ml.dmlc.xgboost4j.java.example.util.CustomEval; public class PredictFirstNtree { public static void main(String[] args) throws XGBoostError { // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); //specify parameters HashMap params = new HashMap(); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java index 0fcfb39de..7b1dfcb28 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java @@ -31,8 +31,8 @@ import ml.dmlc.xgboost4j.java.XGBoostError; public class PredictLeafIndices { public static void main(String[] args) throws XGBoostError { // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); //specify parameters HashMap params = new 
HashMap(); diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala index e8481b047..1893288b4 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,8 +36,8 @@ object BasicWalkThrough { } def main(args: Array[String]): Unit = { - val trainMax = new DMatrix("../../demo/data/agaricus.txt.train") - val testMax = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMax = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMax = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 @@ -76,7 +76,7 @@ object BasicWalkThrough { // build dmatrix from CSR Sparse Matrix println("start build dmatrix from csr sparse data ...") - val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train") + val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm") val trainMax2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data, JDMatrix.SparseType.CSR) trainMax2.setLabel(spData.labels) diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala index b894532fa..09b72fc50 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala +++ 
b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala @@ -24,8 +24,8 @@ object BoostFromPrediction { def main(args: Array[String]): Unit = { println("start running example to start from a initial prediction") - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala index 62f8b461a..6083209ec 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala @@ -21,7 +21,7 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix} object CrossValidation { def main(args: Array[String]): Unit = { - val trainMat: DMatrix = new DMatrix("../../demo/data/agaricus.txt.train") + val trainMat: DMatrix = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") // set params val params = new mutable.HashMap[String, Any] diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala index fe88423e7..8cc49c90d 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala @@ -138,8 +138,8 @@ object CustomObjective { } def main(args: Array[String]): Unit = { - val 
trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 params += "max_depth" -> 2 diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala index 447c98295..c7f3d8bbb 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala @@ -25,8 +25,8 @@ object ExternalMemory { // this is the only difference, add a # followed by a cache prefix name // several cache file with the prefix will be generated // currently only support convert from libsvm file - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm#dtrain.cache") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm#dtest.cache") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala index 27ed98eca..e370010b6 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala @@ -27,8 +27,8 @@ import 
ml.dmlc.xgboost4j.scala.example.util.CustomEval */ object GeneralizedLinearModel { def main(args: Array[String]): Unit = { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") // specify parameters // change booster to gblinear, so that we are fitting a linear model diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala index 5395e3638..40a5ffc44 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala @@ -23,8 +23,8 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix} object PredictFirstNTree { def main(args: Array[String]): Unit = { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala index f40a8aac6..7ae2e6520 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala @@ -25,8 +25,8 @@ import 
ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix} object PredictLeafIndices { def main(args: Array[String]): Unit = { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java index cce1254d0..20a243f5b 100644 --- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java +++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java @@ -30,8 +30,8 @@ import org.junit.Test; * @author hzx */ public class BoosterImplTest { - private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1"; - private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1"; + private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1&format=libsvm"; + private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1&format=libsvm"; public static class EvalError implements IEvaluation { @Override diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java index cf174c6dd..d658c5529 100644 --- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java +++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software @@ -88,7 +88,7 @@ public class DMatrixTest { public void testCreateFromFile() throws XGBoostError { //create DMatrix from file String filePath = writeResourceIntoTempFile("/agaricus.txt.test"); - DMatrix dmat = new DMatrix(filePath); + DMatrix dmat = new DMatrix(filePath + "?format=libsvm"); //get label float[] labels = dmat.getLabel(); //check length diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala index 05c6856f7..53325effa 100644 --- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala +++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala @@ -25,7 +25,7 @@ import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix} class DMatrixSuite extends AnyFunSuite { test("create DMatrix from File") { - val dmat = new DMatrix("../../demo/data/agaricus.txt.test") + val dmat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") // get label val labels: Array[Float] = dmat.getLabel // check length diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala index 8cac9fe4f..2eda1fa2d 100644 --- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala +++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala @@ -95,8 +95,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("basic operation of booster") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val 
testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val booster = trainBooster(trainMat, testMat) val predicts = booster.predict(testMat, true) @@ -106,8 +106,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { test("save/load model with path") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val eval = new EvalError val booster = trainBooster(trainMat, testMat) // save and load @@ -123,8 +123,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("save/load model with stream") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val eval = new EvalError val booster = trainBooster(trainMat, testMat) // save and load @@ -139,7 +139,7 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("cross validation") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") val params = List("eta" -> "1.0", "max_depth" -> "3", "silent" -> "1", "nthread" -> "6", "objective" -> "binary:logistic", "gamma" -> "1.0", "eval_metric" -> "error").toMap val round = 2 @@ -148,8 +148,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test with quantile histo depthwise") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val paramMap = 
List("max_depth" -> "3", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "eval_metric" -> "auc").toMap @@ -158,8 +158,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test with quantile histo lossguide") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val paramMap = List("max_depth" -> "3", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide", "max_leaves" -> "8", "eval_metric" -> "auc").toMap @@ -168,8 +168,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test with quantile histo lossguide with max bin") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val paramMap = List("max_depth" -> "3", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16", @@ -179,8 +179,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test with quantile histo depthwidth with max depth") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val paramMap = List("max_depth" -> "0", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "max_leaves" -> "8", "max_depth" -> "2", @@ -190,8 
+190,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test with quantile histo depthwidth with max depth and max bin") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val paramMap = List("max_depth" -> "0", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2", @@ -201,7 +201,7 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test training from existing model in scala") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") val paramMap = List("max_depth" -> "0", "silent" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2", @@ -213,8 +213,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite { } test("test getting number of features from a booster") { - val trainMat = new DMatrix("../../demo/data/agaricus.txt.train") - val testMat = new DMatrix("../../demo/data/agaricus.txt.test") + val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") + val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") val booster = trainBooster(trainMat, testMat) TestCase.assertEquals(booster.getNumFeature, 127) diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 5566e0b2d..026381fe1 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -882,5 +882,12 @@ def data_dir(path: str) -> str: return os.path.join(demo_dir(path), "data") +def load_agaricus(path: str) -> Tuple[xgb.DMatrix, xgb.DMatrix]: + dpath = 
data_dir(path) + dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train?format=libsvm")) + dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test?format=libsvm")) + return dtrain, dtest + + def project_root(path: str) -> str: return normpath(os.path.join(demo_dir(path), os.path.pardir)) diff --git a/src/data/data.cc b/src/data/data.cc index 236bd9131..1aedd6d92 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -819,8 +819,7 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) { return nullptr; } -DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode, - const std::string& file_format) { +DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) { auto need_split = false; if (collective::IsFederated()) { LOG(CONSOLE) << "XGBoost federated mode detected, not splitting data among workers"; @@ -862,11 +861,9 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s } // legacy handling of binary data loading - if (file_format == "auto") { - DMatrix* loaded = TryLoadBinary(fname, silent); - if (loaded) { - return loaded; - } + DMatrix* loaded = TryLoadBinary(fname, silent); + if (loaded) { + return loaded; } int partid = 0, npart = 1; @@ -882,47 +879,24 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts"; } + data::ValidateFileFormat(fname); DMatrix* dmat {nullptr}; - try { - if (cache_file.empty()) { - std::unique_ptr> parser( - dmlc::Parser::Create(fname.c_str(), partid, npart, file_format.c_str())); - data::FileAdapter adapter(parser.get()); - dmat = DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), Context{}.Threads(), - cache_file, data_split_mode); - } else { - data::FileIterator iter{fname, static_cast(partid), static_cast(npart), - file_format}; - dmat = new data::SparsePageDMatrix{&iter, - iter.Proxy(), - data::fileiter::Reset, - 
data::fileiter::Next, - std::numeric_limits::quiet_NaN(), - 1, - cache_file}; - } - } catch (dmlc::Error& e) { - std::vector splited = common::Split(fname, '#'); - std::vector args = common::Split(splited.front(), '?'); - std::string format {file_format}; - if (args.size() == 1 && file_format == "auto") { - auto extension = common::Split(args.front(), '.').back(); - if (extension == "csv" || extension == "libsvm") { - format = extension; - } - if (format == extension) { - LOG(WARNING) - << "No format parameter is provided in input uri, but found file extension: " - << format << " . " - << "Consider providing a uri parameter: filename?format=" << format; - } else { - LOG(WARNING) - << "No format parameter is provided in input uri. " - << "Choosing default parser in dmlc-core. " - << "Consider providing a uri parameter like: filename?format=csv"; - } - } - LOG(FATAL) << "Encountered parser error:\n" << e.what(); + + if (cache_file.empty()) { + std::unique_ptr> parser( + dmlc::Parser::Create(fname.c_str(), partid, npart, "auto")); + data::FileAdapter adapter(parser.get()); + dmat = DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), Context{}.Threads(), + cache_file, data_split_mode); + } else { + data::FileIterator iter{fname, static_cast(partid), static_cast(npart)}; + dmat = new data::SparsePageDMatrix{&iter, + iter.Proxy(), + data::fileiter::Reset, + data::fileiter::Next, + std::numeric_limits::quiet_NaN(), + 1, + cache_file}; } if (need_split && data_split_mode == DataSplitMode::kCol) { diff --git a/src/data/file_iterator.h b/src/data/file_iterator.h index 96f0e09d4..4d7239677 100644 --- a/src/data/file_iterator.h +++ b/src/data/file_iterator.h @@ -1,22 +1,50 @@ -/*! 
- * Copyright 2021 XGBoost contributors +/** + * Copyright 2021-2023, XGBoost contributors */ #ifndef XGBOOST_DATA_FILE_ITERATOR_H_ #define XGBOOST_DATA_FILE_ITERATOR_H_ -#include +#include #include -#include +#include #include +#include +#include "array_interface.h" #include "dmlc/data.h" #include "xgboost/c_api.h" #include "xgboost/json.h" #include "xgboost/linalg.h" -#include "array_interface.h" namespace xgboost { namespace data { +inline void ValidateFileFormat(std::string const& uri) { + std::vector name_cache = common::Split(uri, '#'); + CHECK_LE(name_cache.size(), 2) + << "Only one `#` is allowed in file path for cachefile specification"; + + std::vector name_args = common::Split(name_cache[0], '?'); + CHECK_LE(name_args.size(), 2) << "only one `?` is allowed in file path."; + + StringView msg{"URI parameter `format` is required for loading text data: filename?format=csv"}; + CHECK_EQ(name_args.size(), 2) << msg; + + std::map args; + std::vector arg_list = common::Split(name_args[1], '&'); + for (size_t i = 0; i < arg_list.size(); ++i) { + std::istringstream is(arg_list[i]); + std::pair kv; + CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format" + << " for key in arg " << i + 1; + CHECK(std::getline(is, kv.second)) << "Invalid uri argument format" + << " for value in arg " << i + 1; + args.insert(kv); + } + if (args.find("format") == args.cend()) { + LOG(FATAL) << msg; + } +} + /** * An iterator for implementing external memory support with file inputs. Users of * external memory are encouraged to define their own file parsers/loaders so this one is @@ -31,8 +59,6 @@ class FileIterator { uint32_t part_idx_; // Equals to total number of workers. uint32_t n_parts_; - // Format of the input file, like "libsvm". 
- std::string type_; DMatrixHandle proxy_; @@ -45,10 +71,9 @@ class FileIterator { std::string indices_; public: - FileIterator(std::string uri, unsigned part_index, unsigned num_parts, - std::string type) - : uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts}, - type_{std::move(type)} { + FileIterator(std::string uri, unsigned part_index, unsigned num_parts) + : uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts} { + ValidateFileFormat(uri_); XGProxyDMatrixCreate(&proxy_); } ~FileIterator() { @@ -94,9 +119,7 @@ class FileIterator { auto Proxy() -> decltype(proxy_) { return proxy_; } void Reset() { - CHECK(!type_.empty()); - parser_.reset(dmlc::Parser::Create(uri_.c_str(), part_idx_, - n_parts_, type_.c_str())); + parser_.reset(dmlc::Parser::Create(uri_.c_str(), part_idx_, n_parts_, "auto")); } }; diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index ccfdbff52..b8de641ff 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -88,7 +88,8 @@ inline std::shared_ptr GetExternalMemoryDMatrixFromData( fo << row_data.str() << "\n"; } fo.close(); - return std::shared_ptr(DMatrix::Load(tmp_file + "#" + tmp_file + ".cache")); + return std::shared_ptr( + DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache")); } // Test that elements are approximately equally distributed among bins diff --git a/tests/cpp/data/test_file_iterator.cc b/tests/cpp/data/test_file_iterator.cc index 31da2c1fa..bd8c4b9c2 100644 --- a/tests/cpp/data/test_file_iterator.cc +++ b/tests/cpp/data/test_file_iterator.cc @@ -29,16 +29,16 @@ TEST(FileIterator, Basic) { { auto zpath = tmpdir.path + "/0-based.svm"; CreateBigTestData(zpath, 3 * 64, true); - zpath += "?indexing_mode=0"; - FileIterator iter{zpath, 0, 1, "libsvm"}; + zpath += "?indexing_mode=0&format=libsvm"; + FileIterator iter{zpath, 0, 1}; check_n_features(&iter); } { auto opath = tmpdir.path + "/1-based.svm"; CreateBigTestData(opath, 3 
* 64, false); - opath += "?indexing_mode=1"; - FileIterator iter{opath, 0, 1, "libsvm"}; + opath += "?indexing_mode=1&format=libsvm"; + FileIterator iter{opath, 0, 1}; check_n_features(&iter); } } diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index 895844180..5ebe1c6bd 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -157,8 +157,7 @@ TEST(MetaInfo, LoadQid) { dmlc::TemporaryDirectory tempdir; std::string tmp_file = tempdir.path + "/qid_test.libsvm"; { - std::unique_ptr fs( - dmlc::Stream::Create(tmp_file.c_str(), "w")); + std::unique_ptr fs(dmlc::Stream::Create(tmp_file.c_str(), "w")); dmlc::ostream os(fs.get()); os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0 2 qid:1 1:0 2:0 3:1 4:0.1 5:1 @@ -175,7 +174,7 @@ TEST(MetaInfo, LoadQid) { os.set_stream(nullptr); } std::unique_ptr dmat( - xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kRow, "libsvm")); + xgboost::DMatrix::Load(tmp_file + "?format=libsvm", true, xgboost::DataSplitMode::kRow)); const xgboost::MetaInfo& info = dmat->Info(); const std::vector expected_group_ptr{0, 4, 8, 12}; diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index a37352626..3bdbf5403 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -17,11 +17,15 @@ using namespace xgboost; // NOLINT +namespace { +std::string UriSVM(std::string name) { return name + "?format=libsvm"; } +} // namespace + TEST(SimpleDMatrix, MetaInfo) { dmlc::TemporaryDirectory tempdir; const std::string tmp_file = tempdir.path + "/simple.libsvm"; CreateSimpleTestData(tmp_file); - xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file); + xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file)); // Test the metadata that was parsed EXPECT_EQ(dmat->Info().num_row_, 2); @@ -37,7 +41,7 @@ TEST(SimpleDMatrix, RowAccess) { dmlc::TemporaryDirectory tempdir; const std::string tmp_file = tempdir.path + 
"/simple.libsvm"; CreateSimpleTestData(tmp_file); - xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, false); + xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file), false); // Loop over the batches and count the records int64_t row_count = 0; @@ -60,7 +64,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) { dmlc::TemporaryDirectory tempdir; const std::string tmp_file = tempdir.path + "/simple.libsvm"; CreateSimpleTestData(tmp_file); - xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file); + xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file)); ASSERT_TRUE(dmat->SingleColBlock()); @@ -387,7 +391,7 @@ TEST(SimpleDMatrix, SaveLoadBinary) { dmlc::TemporaryDirectory tempdir; const std::string tmp_file = tempdir.path + "/simple.libsvm"; CreateSimpleTestData(tmp_file); - xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file); + xgboost::DMatrix * dmat = xgboost::DMatrix::Load(UriSVM(tmp_file)); data::SimpleDMatrix *simple_dmat = dynamic_cast(dmat); const std::string tmp_binfile = tempdir.path + "/csr_source.binary"; diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index 24dc40949..608c32947 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -16,14 +16,19 @@ #include "../helpers.h" using namespace xgboost; // NOLINT +namespace { +std::string UriSVM(std::string name, std::string cache) { + return name + "?format=libsvm" + "#" + cache + ".cache"; +} +} // namespace template void TestSparseDMatrixLoadFile() { dmlc::TemporaryDirectory tmpdir; auto opath = tmpdir.path + "/1-based.svm"; CreateBigTestData(opath, 3 * 64, false); - opath += "?indexing_mode=1"; - data::FileIterator iter{opath, 0, 1, "libsvm"}; + opath += "?indexing_mode=1&format=libsvm"; + data::FileIterator iter{opath, 0, 1}; auto n_threads = 0; data::SparsePageDMatrix m{&iter, iter.Proxy(), @@ -112,15 +117,13 @@ TEST(SparsePageDMatrix, MetaInfo) { size_t constexpr 
kEntries = 24; CreateBigTestData(tmp_file, kEntries); - xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", false); + std::unique_ptr dmat{xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file), false)}; // Test the metadata that was parsed EXPECT_EQ(dmat->Info().num_row_, 8ul); EXPECT_EQ(dmat->Info().num_col_, 5ul); EXPECT_EQ(dmat->Info().num_nonzero_, kEntries); EXPECT_EQ(dmat->Info().labels.Size(), dmat->Info().num_row_); - - delete dmat; } TEST(SparsePageDMatrix, RowAccess) { @@ -139,7 +142,7 @@ TEST(SparsePageDMatrix, ColAccess) { dmlc::TemporaryDirectory tempdir; const std::string tmp_file = tempdir.path + "/simple.libsvm"; CreateSimpleTestData(tmp_file); - xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache"); + xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file)); // Loop over the batches and assert the data is as expected size_t iter = 0; @@ -231,7 +234,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) { std::string filename = tempdir.path + "/simple.libsvm"; CreateBigTestData(filename, 1 << 16); - data::FileIterator iter(filename, 0, 1, "auto"); + data::FileIterator iter(filename + "?format=libsvm", 0, 1); std::unique_ptr sparse{ new data::SparsePageDMatrix{&iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next, std::numeric_limits::quiet_NaN(), threads, filename}}; diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index bb562ffb7..55a44e458 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -13,7 +13,7 @@ TEST(SparsePageDMatrix, EllpackPage) { dmlc::TemporaryDirectory tempdir; const std::string tmp_file = tempdir.path + "/simple.libsvm"; CreateSimpleTestData(tmp_file); - DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache"); + DMatrix* dmat = DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache"); // Loop over the 
batches and assert the data is as expected size_t n = 0; diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 76fd2f967..7c81b96f9 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -548,7 +548,7 @@ std::unique_ptr CreateSparsePageDMatrixWithRC( } fo.close(); - std::string uri = tmp_file; + std::string uri = tmp_file + "?format=libsvm"; if (page_size > 0) { uri += "#" + tmp_file + ".cache"; } diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 91e8070c2..a3bb30fcd 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -126,7 +126,8 @@ TEST(Learner, SLOW_CheckMultiBatch) { // NOLINT dmlc::TemporaryDirectory tempdir; const std::string tmp_file = tempdir.path + "/big.libsvm"; CreateBigTestData(tmp_file, 50000); - std::shared_ptr dmat(xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache")); + std::shared_ptr dmat( + xgboost::DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache")); EXPECT_FALSE(dmat->SingleColBlock()); size_t num_row = dmat->Info().num_row_; std::vector labels(num_row); diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index fab2a6eca..e512e4bc6 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -21,8 +21,7 @@ class TestBasic: assert not lazy_isinstance(a, 'numpy', 'dataframe') def test_basic(self): - dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') - dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') + dtrain, dtest = tm.load_agaricus(__file__) param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} # specify validations set to watch performance @@ -61,8 +60,7 @@ class TestBasic: def test_metric_config(self): # Make sure that the metric configuration happens in booster so the # string `['error', 'auc']` doesn't get passed down to core. 
- dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') - dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') + dtrain, dtest = tm.load_agaricus(__file__) param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective': 'binary:logistic', 'eval_metric': ['error', 'auc']} watchlist = [(dtest, 'eval'), (dtrain, 'train')] @@ -78,8 +76,7 @@ class TestBasic: np.testing.assert_allclose(predt_0, predt_1) def test_multiclass(self): - dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') - dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') + dtrain, dtest = tm.load_agaricus(__file__) param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'num_class': 2} # specify validations set to watch performance watchlist = [(dtest, 'eval'), (dtrain, 'train')] @@ -188,7 +185,7 @@ class TestBasic: assert dm.num_col() == cols def test_cv(self): - dm = xgb.DMatrix(dpath + 'agaricus.txt.train') + dm, _ = tm.load_agaricus(__file__) params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective': 'binary:logistic'} @@ -198,7 +195,7 @@ class TestBasic: assert len(cv) == (4) def test_cv_no_shuffle(self): - dm = xgb.DMatrix(dpath + 'agaricus.txt.train') + dm, _ = tm.load_agaricus(__file__) params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective': 'binary:logistic'} @@ -209,7 +206,7 @@ class TestBasic: assert len(cv) == (4) def test_cv_explicit_fold_indices(self): - dm = xgb.DMatrix(dpath + 'agaricus.txt.train') + dm, _ = tm.load_agaricus(__file__) params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective': 'binary:logistic'} folds = [ @@ -268,8 +265,7 @@ class TestBasicPathLike: def test_DMatrix_init_from_path(self): """Initialization from the data path.""" - dpath = Path('demo/data') - dtrain = xgb.DMatrix(dpath / 'agaricus.txt.train') + dtrain, _ = tm.load_agaricus(__file__) assert dtrain.num_row() == 6513 assert dtrain.num_col() == 127 diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index d76205593..610a9236e 100644 --- a/tests/python/test_basic_models.py +++ 
b/tests/python/test_basic_models.py @@ -42,8 +42,7 @@ class TestModels: param = {'verbosity': 0, 'objective': 'binary:logistic', 'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1, 'nthread': 1} - dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) - dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test")) + dtrain, dtest = tm.load_agaricus(__file__) watchlist = [(dtest, 'eval'), (dtrain, 'train')] num_round = 4 bst = xgb.train(param, dtrain, num_round, watchlist) @@ -55,8 +54,7 @@ class TestModels: assert err < 0.2 def test_dart(self): - dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) - dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test")) + dtrain, dtest = tm.load_agaricus(__file__) param = {'max_depth': 5, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'booster': 'dart', 'verbosity': 1} # specify validations set to watch performance @@ -122,7 +120,7 @@ class TestModels: def test_boost_from_prediction(self): # Re-construct dtrain here to avoid modification - margined = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) + margined, _ = tm.load_agaricus(__file__) bst = xgb.train({'tree_method': 'hist'}, margined, 1) predt_0 = bst.predict(margined, output_margin=True) margined.set_base_margin(predt_0) @@ -130,13 +128,13 @@ class TestModels: predt_1 = bst.predict(margined) assert np.any(np.abs(predt_1 - predt_0) > 1e-6) - dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) + dtrain, _ = tm.load_agaricus(__file__) bst = xgb.train({'tree_method': 'hist'}, dtrain, 2) predt_2 = bst.predict(dtrain) assert np.all(np.abs(predt_2 - predt_1) < 1e-6) def test_boost_from_existing_model(self): - X = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) + X, _ = tm.load_agaricus(__file__) booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4) assert booster.num_boosted_rounds() == 4 booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4, @@ -156,8 +154,7 @@ class TestModels: 'objective': 
'reg:logistic', "tree_method": tree_method } - dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) - dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test")) + dtrain, dtest = tm.load_agaricus(__file__) watchlist = [(dtest, 'eval'), (dtrain, 'train')] num_round = 10 @@ -203,8 +200,7 @@ class TestModels: self.run_custom_objective() def test_multi_eval_metric(self): - dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) - dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test")) + dtrain, dtest = tm.load_agaricus(__file__) watchlist = [(dtest, 'eval'), (dtrain, 'train')] param = {'max_depth': 2, 'eta': 0.2, 'verbosity': 1, 'objective': 'binary:logistic'} @@ -226,7 +222,7 @@ class TestModels: param['scale_pos_weight'] = ratio return (dtrain, dtest, param) - dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) + dtrain, _ = tm.load_agaricus(__file__) xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'}, seed=0, fpreproc=fpreproc) @@ -234,7 +230,7 @@ class TestModels: param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective': 'binary:logistic'} num_round = 2 - dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) + dtrain, _ = tm.load_agaricus(__file__) xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0, show_stdv=False) @@ -392,7 +388,7 @@ class TestModels: os.remove(model_path) try: - dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) + dtrain, _ = tm.load_agaricus(__file__) xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1) except ValueError as e: e_str = str(e) diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py index e8375aa5e..d3ec05e6e 100644 --- a/tests/python/test_callback.py +++ b/tests/python/test_callback.py @@ -275,9 +275,7 @@ class TestCallbacks: """Test learning rate scheduler, used by both CPU and GPU tests.""" scheduler = xgb.callback.LearningRateScheduler - dpath = tm.data_dir(__file__) - dtrain = xgb.DMatrix(os.path.join(dpath, 
"agaricus.txt.train")) - dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test")) + dtrain, dtest = tm.load_agaricus(__file__) watchlist = [(dtest, 'eval'), (dtrain, 'train')] num_round = 4 @@ -361,9 +359,7 @@ class TestCallbacks: num_round = 4 scheduler = xgb.callback.LearningRateScheduler - dpath = tm.data_dir(__file__) - dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train")) - dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test")) + dtrain, dtest = tm.load_agaricus(__file__) watchlist = [(dtest, 'eval'), (dtrain, 'train')] param = { diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index ef56ff656..bcc089afb 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -283,7 +283,7 @@ class TestDMatrix: assert m0.feature_types == m1.feature_types def test_get_info(self): - dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') + dtrain, _ = tm.load_agaricus(__file__) dtrain.get_float_info('label') dtrain.get_float_info('weight') dtrain.get_float_info('base_margin') @@ -432,7 +432,9 @@ class TestDMatrix: def test_uri_categorical(self): path = os.path.join(dpath, 'agaricus.txt.train') feature_types = ["q"] * 5 + ["c"] + ["q"] * 120 - Xy = xgb.DMatrix(path + "?indexing_mode=1", feature_types=feature_types) + Xy = xgb.DMatrix( + path + "?indexing_mode=1&format=libsvm", feature_types=feature_types + ) np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types)) def test_base_margin(self): diff --git a/tests/python/test_interaction_constraints.py b/tests/python/test_interaction_constraints.py index 96d2ba7dc..5eaaf1f8c 100644 --- a/tests/python/test_interaction_constraints.py +++ b/tests/python/test_interaction_constraints.py @@ -88,8 +88,12 @@ class TestInteractionConstraints: def training_accuracy(self, tree_method): """Test accuracy, reused by GPU tests.""" from sklearn.metrics import accuracy_score - dtrain = xgboost.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1') - dtest = 
xgboost.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1') + dtrain = xgboost.DMatrix( + dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm" + ) + dtest = xgboost.DMatrix( + dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm" + ) params = { 'eta': 1, 'max_depth': 6, diff --git a/tests/python/test_monotone_constraints.py b/tests/python/test_monotone_constraints.py index 4dbfaa60d..a3785f1cb 100644 --- a/tests/python/test_monotone_constraints.py +++ b/tests/python/test_monotone_constraints.py @@ -134,8 +134,8 @@ class TestMonotoneConstraints: @pytest.mark.skipif(**tm.no_sklearn()) def test_training_accuracy(self): from sklearn.metrics import accuracy_score - dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1') - dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1') + dtrain = xgb.DMatrix(dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm") + dtest = xgb.DMatrix(dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm") params = {'eta': 1, 'max_depth': 6, 'objective': 'binary:logistic', 'tree_method': 'hist', 'monotone_constraints': '(1, 0)'} num_boost_round = 5 diff --git a/tests/python/test_openmp.py b/tests/python/test_openmp.py index c53363736..82b0ba270 100644 --- a/tests/python/test_openmp.py +++ b/tests/python/test_openmp.py @@ -13,9 +13,7 @@ pytestmark = tm.timeout(10) class TestOMP: def test_omp(self): - dpath = 'demo/data/' - dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') - dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') + dtrain, dtest = tm.load_agaricus(__file__) param = {'booster': 'gbtree', 'objective': 'binary:logistic', diff --git a/tests/python/test_parse_tree.py b/tests/python/test_parse_tree.py index 885c0f1e2..9d80d0f6f 100644 --- a/tests/python/test_parse_tree.py +++ b/tests/python/test_parse_tree.py @@ -13,7 +13,7 @@ rng = np.random.RandomState(1994) class TestTreesToDataFrame: def build_model(self, max_depth, num_round): - dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') + dtrain, _ = 
tm.load_agaricus(__file__) param = {'max_depth': max_depth, 'objective': 'binary:logistic', 'verbosity': 1} num_round = num_round diff --git a/tests/python/test_plotting.py b/tests/python/test_plotting.py index dc45cd254..303c7c8c1 100644 --- a/tests/python/test_plotting.py +++ b/tests/python/test_plotting.py @@ -17,12 +17,10 @@ except ImportError: pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(), tm.no_graphviz())) -dpath = 'demo/data/agaricus.txt.train' - class TestPlotting: def test_plotting(self): - m = xgb.DMatrix(dpath) + m, _ = tm.load_agaricus(__file__) booster = xgb.train({'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}, m, num_boost_round=2) diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py index 4d861ad6e..2585da088 100644 --- a/tests/python/test_shap.py +++ b/tests/python/test_shap.py @@ -46,8 +46,8 @@ class TestSHAP: fscores = bst.get_fscore() assert scores1 == fscores - dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') - dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') + dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?format=libsvm') + dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?format=libsvm') def fn(max_depth, num_rounds): # train diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index dd710f6a4..78097a4ea 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -154,9 +154,7 @@ class TestTreeMethod: def test_hist_categorical(self): # hist must be same as exact on all-categorial data - dpath = 'demo/data/' - ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') - ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') + ag_dtrain, ag_dtest = tm.load_agaricus(__file__) ag_param = {'max_depth': 2, 'tree_method': 'hist', 'eta': 1, diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py index 07295eb6c..f8a21b6ab 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -222,7 +222,7 @@ class TestPandas: 
set_base_margin_info(pd.DataFrame, xgb.DMatrix, "hist") def test_cv_as_pandas(self): - dm = xgb.DMatrix(dpath + 'agaricus.txt.train') + dm, _ = tm.load_agaricus(__file__) params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective': 'binary:logistic', 'eval_metric': 'error'}