[Breaking] Require format to be specified in input URI. (#9077)

Previously, we used `libsvm` as the default when the format was not specified. However, the
dmlc data parser is not particularly robust against errors, and the most common type of
error is an undefined format.

Along with this change, we recommend that users use other data loaders instead. We will
continue to maintain the parsers, as they are currently used for many internal tests,
including federated learning.
This commit is contained in:
Jiaming Yuan
2023-04-28 19:45:15 +08:00
committed by GitHub
parent e922004329
commit 1f9a57d17b
58 changed files with 327 additions and 268 deletions

View File

@@ -30,8 +30,8 @@ import org.junit.Test;
* @author hzx
*/
public class BoosterImplTest {
private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1";
private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1";
private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1&format=libsvm";
private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1&format=libsvm";
public static class EvalError implements IEvaluation {
@Override

View File

@@ -4,7 +4,7 @@
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
@@ -88,7 +88,7 @@ public class DMatrixTest {
public void testCreateFromFile() throws XGBoostError {
//create DMatrix from file
String filePath = writeResourceIntoTempFile("/agaricus.txt.test");
DMatrix dmat = new DMatrix(filePath);
DMatrix dmat = new DMatrix(filePath + "?format=libsvm");
//get label
float[] labels = dmat.getLabel();
//check length

View File

@@ -25,7 +25,7 @@ import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
class DMatrixSuite extends AnyFunSuite {
test("create DMatrix from File") {
val dmat = new DMatrix("../../demo/data/agaricus.txt.test")
val dmat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
// get label
val labels: Array[Float] = dmat.getLabel
// check length

View File

@@ -95,8 +95,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
}
test("basic operation of booster") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
val booster = trainBooster(trainMat, testMat)
val predicts = booster.predict(testMat, true)
@@ -106,8 +106,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
test("save/load model with path") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
val eval = new EvalError
val booster = trainBooster(trainMat, testMat)
// save and load
@@ -123,8 +123,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
}
test("save/load model with stream") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
val eval = new EvalError
val booster = trainBooster(trainMat, testMat)
// save and load
@@ -139,7 +139,7 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
}
test("cross validation") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val params = List("eta" -> "1.0", "max_depth" -> "3", "silent" -> "1", "nthread" -> "6",
"objective" -> "binary:logistic", "gamma" -> "1.0", "eval_metric" -> "error").toMap
val round = 2
@@ -148,8 +148,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
}
test("test with quantile histo depthwise") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
val paramMap = List("max_depth" -> "3", "silent" -> "0",
"objective" -> "binary:logistic", "tree_method" -> "hist",
"grow_policy" -> "depthwise", "eval_metric" -> "auc").toMap
@@ -158,8 +158,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
}
test("test with quantile histo lossguide") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
val paramMap = List("max_depth" -> "3", "silent" -> "0",
"objective" -> "binary:logistic", "tree_method" -> "hist",
"grow_policy" -> "lossguide", "max_leaves" -> "8", "eval_metric" -> "auc").toMap
@@ -168,8 +168,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
}
test("test with quantile histo lossguide with max bin") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
val paramMap = List("max_depth" -> "3", "silent" -> "0",
"objective" -> "binary:logistic", "tree_method" -> "hist",
"grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16",
@@ -179,8 +179,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
}
test("test with quantile histo depthwidth with max depth") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
val paramMap = List("max_depth" -> "0", "silent" -> "0",
"objective" -> "binary:logistic", "tree_method" -> "hist",
"grow_policy" -> "depthwise", "max_leaves" -> "8", "max_depth" -> "2",
@@ -190,8 +190,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
}
test("test with quantile histo depthwidth with max depth and max bin") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
val paramMap = List("max_depth" -> "0", "silent" -> "0",
"objective" -> "binary:logistic", "tree_method" -> "hist",
"grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
@@ -201,7 +201,7 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
}
test("test training from existing model in scala") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val paramMap = List("max_depth" -> "0", "silent" -> "0",
"objective" -> "binary:logistic", "tree_method" -> "hist",
"grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
@@ -213,8 +213,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
}
test("test getting number of features from a booster") {
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
val booster = trainBooster(trainMat, testMat)
TestCase.assertEquals(booster.getNumFeature, 127)