[jvm-packages]support multiple validation datasets in Spark (#3910)

* add back train method but mark as deprecated

* add back train method but mark as deprecated

* add back train method but mark as deprecated

* add back train method but mark as deprecated

* fix scalastyle error

* fix scalastyle error

* fix scalastyle error

* fix scalastyle error

* wrap iterators

* enable copartition training and validationset

* add parameters

* converge code path and have init unit test

* enable multi evals for ranking

* unit test and doc

* update example

* fix early stopping

* address the offline comments

* udpate doc

* test eval metrics

* fix compilation issue

* fix example
This commit is contained in:
Nan Zhu
2018-12-17 21:03:57 -08:00
committed by GitHub
parent c8c7b9649c
commit c055a32609
14 changed files with 477 additions and 136 deletions

View File

@@ -137,7 +137,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
assert(predictionDF.columns.contains("final_prediction") === false)
assert(model.summary.trainObjectiveHistory.length === 5)
assert(model.summary.testObjectiveHistory.isEmpty)
assert(model.summary.validationObjectiveHistory.isEmpty)
}
test("XGBoost and Spark parameters synchronize correctly") {
@@ -191,31 +191,6 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
assert(count != 0)
}
test("training summary") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic", "num_round" -> 5, "nWorkers" -> numWorkers)
val trainingDF = buildDataFrame(Classification.train)
val xgb = new XGBoostClassifier(paramMap)
val model = xgb.fit(trainingDF)
assert(model.summary.trainObjectiveHistory.length === 5)
assert(model.summary.testObjectiveHistory.isEmpty)
}
test("train/test split") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
"num_round" -> 5, "num_workers" -> numWorkers)
val training = buildDataFrame(Classification.train)
val xgb = new XGBoostClassifier(paramMap)
val model = xgb.fit(training)
val Some(testObjectiveHistory) = model.summary.testObjectiveHistory
assert(testObjectiveHistory.length === 5)
assert(model.summary.trainObjectiveHistory !== testObjectiveHistory)
}
test("test predictionLeaf") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",

View File

@@ -277,4 +277,93 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
assert(booster != null)
}
test("training summary") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic", "num_round" -> 5, "nWorkers" -> numWorkers)
val trainingDF = buildDataFrame(Classification.train)
val xgb = new XGBoostClassifier(paramMap)
val model = xgb.fit(trainingDF)
assert(model.summary.trainObjectiveHistory.length === 5)
assert(model.summary.validationObjectiveHistory.isEmpty)
}
test("train/test split") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
"num_round" -> 5, "num_workers" -> numWorkers)
val training = buildDataFrame(Classification.train)
val xgb = new XGBoostClassifier(paramMap)
val model = xgb.fit(training)
assert(model.summary.validationObjectiveHistory.length === 1)
assert(model.summary.validationObjectiveHistory(0)._1 === "test")
assert(model.summary.validationObjectiveHistory(0)._2.length === 5)
assert(model.summary.trainObjectiveHistory !== model.summary.validationObjectiveHistory(0))
}
test("train with multiple validation datasets (non-ranking)") {
val training = buildDataFrame(Classification.train)
val Array(train, eval1, eval2) = training.randomSplit(Array(0.6, 0.2, 0.2))
val paramMap1 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic",
"num_round" -> 5, "num_workers" -> numWorkers)
val xgb1 = new XGBoostClassifier(paramMap1)
xgb1.setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2))
val model1 = xgb1.fit(train)
assert(model1.summary.validationObjectiveHistory.length === 2)
assert(model1.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2"))
assert(model1.summary.validationObjectiveHistory(0)._2.length === 5)
assert(model1.summary.validationObjectiveHistory(1)._2.length === 5)
assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(0))
assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(1))
val paramMap2 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic",
"num_round" -> 5, "num_workers" -> numWorkers,
"eval_sets" -> Map("eval1" -> eval1, "eval2" -> eval2))
val xgb2 = new XGBoostClassifier(paramMap2)
val model2 = xgb2.fit(train)
assert(model2.summary.validationObjectiveHistory.length === 2)
assert(model2.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2"))
assert(model2.summary.validationObjectiveHistory(0)._2.length === 5)
assert(model2.summary.validationObjectiveHistory(1)._2.length === 5)
assert(model2.summary.trainObjectiveHistory !== model2.summary.validationObjectiveHistory(0))
assert(model2.summary.trainObjectiveHistory !== model2.summary.validationObjectiveHistory(1))
}
test("train with multiple validation datasets (ranking)") {
val training = buildDataFrameWithGroup(Ranking.train, 5)
val Array(train, eval1, eval2) = training.randomSplit(Array(0.6, 0.2, 0.2))
val paramMap1 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "rank:pairwise",
"num_round" -> 5, "num_workers" -> numWorkers, "group_col" -> "group")
val xgb1 = new XGBoostRegressor(paramMap1)
xgb1.setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2))
val model1 = xgb1.fit(train)
assert(model1 != null)
assert(model1.summary.validationObjectiveHistory.length === 2)
assert(model1.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2"))
assert(model1.summary.validationObjectiveHistory(0)._2.length === 5)
assert(model1.summary.validationObjectiveHistory(1)._2.length === 5)
assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(0))
assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(1))
val paramMap2 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "rank:pairwise",
"num_round" -> 5, "num_workers" -> numWorkers, "group_col" -> "group",
"eval_sets" -> Map("eval1" -> eval1, "eval2" -> eval2))
val xgb2 = new XGBoostRegressor(paramMap2)
val model2 = xgb2.fit(train)
assert(model2 != null)
assert(model2.summary.validationObjectiveHistory.length === 2)
assert(model2.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2"))
assert(model2.summary.validationObjectiveHistory(0)._2.length === 5)
assert(model2.summary.validationObjectiveHistory(1)._2.length === 5)
assert(model2.summary.trainObjectiveHistory !== model2.summary.validationObjectiveHistory(0))
assert(model2.summary.trainObjectiveHistory !== model2.summary.validationObjectiveHistory(1))
}
}