[jvm-packages]support multiple validation datasets in Spark (#3910)
* add back train method but mark as deprecated * add back train method but mark as deprecated * add back train method but mark as deprecated * add back train method but mark as deprecated * fix scalastyle error * fix scalastyle error * fix scalastyle error * fix scalastyle error * wrap iterators * enable copartition training and validation set * add parameters * converge code path and have init unit test * enable multi evals for ranking * unit test and doc * update example * fix early stopping * address the offline comments * update doc * test eval metrics * fix compilation issue * fix example
This commit is contained in:
@@ -137,7 +137,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
assert(predictionDF.columns.contains("final_prediction") === false)
|
||||
|
||||
assert(model.summary.trainObjectiveHistory.length === 5)
|
||||
assert(model.summary.testObjectiveHistory.isEmpty)
|
||||
assert(model.summary.validationObjectiveHistory.isEmpty)
|
||||
}
|
||||
|
||||
test("XGBoost and Spark parameters synchronize correctly") {
|
||||
@@ -191,31 +191,6 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
assert(count != 0)
|
||||
}
|
||||
|
||||
// Fits a classifier for 5 rounds without any evaluation data and checks that the
// summary holds one training metric per round and no test metric history.
test("training summary") {
  // The worker count is passed as "num_workers", the key used by the other tests in
  // this suite. NOTE(review): the previous "nWorkers" key presumably did not map to
  // any parameter, leaving the worker count at its default -- confirm.
  val paramMap = Map(
    "eta" -> "1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "binary:logistic",
    "num_round" -> 5,
    "num_workers" -> numWorkers)

  val trainingDF = buildDataFrame(Classification.train)
  val xgb = new XGBoostClassifier(paramMap)
  val model = xgb.fit(trainingDF)

  // One entry per boosting round (num_round = 5).
  assert(model.summary.trainObjectiveHistory.length === 5)
  assert(model.summary.testObjectiveHistory.isEmpty)
}
|
||||
|
||||
// Trains with "train_test_ratio" set to 0.5 and checks that the summary exposes a
// test metric history with one entry per round that differs from the training history.
test("train/test split") {
  val splitParams = Map(
    "eta" -> "1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "binary:logistic",
    "train_test_ratio" -> "0.5",
    "num_round" -> 5,
    "num_workers" -> numWorkers)
  val inputDF = buildDataFrame(Classification.train)

  val classifier = new XGBoostClassifier(splitParams)
  val fitted = classifier.fit(inputDF)
  val summary = fitted.summary

  // The pattern binding fails the test if no test history was recorded.
  val Some(heldOutHistory) = summary.testObjectiveHistory
  assert(heldOutHistory.length === 5)
  assert(summary.trainObjectiveHistory !== heldOutHistory)
}
|
||||
|
||||
test("test predictionLeaf") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
|
||||
@@ -277,4 +277,93 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
|
||||
assert(booster != null)
|
||||
}
|
||||
|
||||
// Fits a classifier for 5 rounds without any evaluation data and checks that the
// summary holds one training metric per round and no validation metric history.
test("training summary") {
  // The worker count is passed as "num_workers", the key used by the other tests in
  // this suite. NOTE(review): the previous "nWorkers" key presumably did not map to
  // any parameter, leaving the worker count at its default -- confirm.
  val paramMap = Map(
    "eta" -> "1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "binary:logistic",
    "num_round" -> 5,
    "num_workers" -> numWorkers)

  val trainingDF = buildDataFrame(Classification.train)
  val xgb = new XGBoostClassifier(paramMap)
  val model = xgb.fit(trainingDF)

  // One entry per boosting round (num_round = 5).
  assert(model.summary.trainObjectiveHistory.length === 5)
  assert(model.summary.validationObjectiveHistory.isEmpty)
}
|
||||
|
||||
// Trains with "train_test_ratio" set to 0.5 and checks that the summary reports a
// single validation history named "test" with one metric value per round.
test("train/test split") {
  val paramMap = Map(
    "eta" -> "1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "binary:logistic",
    "train_test_ratio" -> "0.5",
    "num_round" -> 5,
    "num_workers" -> numWorkers)
  val training = buildDataFrame(Classification.train)

  val xgb = new XGBoostClassifier(paramMap)
  val model = xgb.fit(training)

  val validationHistory = model.summary.validationObjectiveHistory
  assert(validationHistory.length === 1)

  val (evalName, evalMetrics) = validationHistory(0)
  assert(evalName === "test")
  assert(evalMetrics.length === 5)
  // Compare against the metric values only: the training history can never equal the
  // whole (name, values) pair, so comparing with the pair would pass vacuously.
  assert(model.summary.trainObjectiveHistory !== evalMetrics)
}
|
||||
|
||||
// Trains a classifier with two named evaluation sets, supplied first through
// setEvalSets and then through the "eval_sets" entry of the parameter map, and checks
// that each run reports one validation history per evaluation set.
test("train with multiple validation datasets (non-ranking)") {
  val training = buildDataFrame(Classification.train)
  val Array(train, eval1, eval2) = training.randomSplit(Array(0.6, 0.2, 0.2))
  val evalSets = Map("eval1" -> eval1, "eval2" -> eval2)

  val paramMap1 = Map(
    "eta" -> "1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "binary:logistic",
    "num_round" -> 5,
    "num_workers" -> numWorkers)

  // Variant 1: evaluation sets registered through the setter.
  val xgb1 = new XGBoostClassifier(paramMap1)
  xgb1.setEvalSets(evalSets)
  val model1 = xgb1.fit(train)
  val validation1 = model1.summary.validationObjectiveHistory
  assert(validation1.length === 2)
  assert(validation1.map(_._1).toSet === Set("eval1", "eval2"))
  validation1.foreach { case (_, metrics) =>
    assert(metrics.length === 5)
    // Compare against the metric values only: the training history can never equal
    // the whole (name, values) pair, so comparing with the pair would pass vacuously.
    assert(model1.summary.trainObjectiveHistory !== metrics)
  }

  // Variant 2: the same evaluation sets passed inside the parameter map.
  val paramMap2 = paramMap1 + ("eval_sets" -> evalSets)
  val xgb2 = new XGBoostClassifier(paramMap2)
  val model2 = xgb2.fit(train)
  val validation2 = model2.summary.validationObjectiveHistory
  assert(validation2.length === 2)
  assert(validation2.map(_._1).toSet === Set("eval1", "eval2"))
  validation2.foreach { case (_, metrics) =>
    assert(metrics.length === 5)
    assert(model2.summary.trainObjectiveHistory !== metrics)
  }
}
|
||||
|
||||
// Trains a "rank:pairwise" regressor with two named evaluation sets, supplied first
// through setEvalSets and then through the "eval_sets" entry of the parameter map, and
// checks that each run reports one validation history per evaluation set.
test("train with multiple validation datasets (ranking)") {
  val training = buildDataFrameWithGroup(Ranking.train, 5)
  val Array(train, eval1, eval2) = training.randomSplit(Array(0.6, 0.2, 0.2))
  val evalSets = Map("eval1" -> eval1, "eval2" -> eval2)

  val paramMap1 = Map(
    "eta" -> "1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "rank:pairwise",
    "num_round" -> 5,
    "num_workers" -> numWorkers,
    "group_col" -> "group")

  // Variant 1: evaluation sets registered through the setter.
  val xgb1 = new XGBoostRegressor(paramMap1)
  xgb1.setEvalSets(evalSets)
  val model1 = xgb1.fit(train)
  assert(model1 != null)
  val validation1 = model1.summary.validationObjectiveHistory
  assert(validation1.length === 2)
  assert(validation1.map(_._1).toSet === Set("eval1", "eval2"))
  validation1.foreach { case (_, metrics) =>
    assert(metrics.length === 5)
    // Compare against the metric values only: the training history can never equal
    // the whole (name, values) pair, so comparing with the pair would pass vacuously.
    assert(model1.summary.trainObjectiveHistory !== metrics)
  }

  // Variant 2: the same evaluation sets passed inside the parameter map.
  val paramMap2 = paramMap1 + ("eval_sets" -> evalSets)
  val xgb2 = new XGBoostRegressor(paramMap2)
  val model2 = xgb2.fit(train)
  assert(model2 != null)
  val validation2 = model2.summary.validationObjectiveHistory
  assert(validation2.length === 2)
  assert(validation2.map(_._1).toSet === Set("eval1", "eval2"))
  validation2.foreach { case (_, metrics) =>
    assert(metrics.length === 5)
    assert(model2.summary.trainObjectiveHistory !== metrics)
  }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user