[jvm-packages] allow partial evaluation of dataframe before prediction (#4407)

* allow partial evaluation of dataframe before prediction
* resume spark test
* comments
* Run unit tests after building JVM packages

parent ea850ecd20
commit 37dc82c3ff
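The change addresses dmlc/xgboost#4406: when the prediction DataFrame is only partially evaluated (e.g. by show()), the per-partition Rabit.init/Rabit.shutdown calls run on only some workers and the tracker can hang. Below is a minimal sketch of that hazard, using only the Rabit and ScalaRabitTracker calls that appear in the test changes in this commit; `sc` and `numWorkers` stand in for the test suite's fixtures and the whole function is illustrative, not part of the commit:

    import ml.dmlc.xgboost4j.java.Rabit
    import ml.dmlc.xgboost4j.scala.rabit.{RabitTracker => ScalaRabitTracker}
    import org.apache.spark.SparkContext

    def partialEvaluationHazard(sc: SparkContext, numWorkers: Int): Unit = {
      val tracker = new ScalaRabitTracker(numWorkers)
      tracker.start(0)
      val trackerEnvs = tracker.getWorkerEnvs

      val results = sc.parallelize(1 to numWorkers * 10, numWorkers).mapPartitions { iter =>
        Rabit.init(trackerEnvs)  // blocks until all numWorkers tasks have joined
        val out = iter.toArray
        Rabit.shutdown()
        Iterator(out)
      }

      results.take(1)    // partial evaluation: tasks may run on only one partition,
                         // leaving Rabit.init waiting for workers that never start
      results.collect()  // full evaluation: every partition runs init and shutdown
    }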
@@ -78,7 +78,7 @@ else ()
 endif (MINGW OR R_LIB)
 add_library(rabit STATIC ${RABIT_SOURCES})
 target_include_directories(rabit PRIVATE
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/rabit/include>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/dmlc-core/include>
   $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/rabit/include/rabit>)
 set_target_properties(rabit
   PROPERTIES
@@ -1 +1 @@
-Subproject commit ac983092ee3b339f76a2d7e7c3b846570218200d
+Subproject commit 13d5acb8ba7e79550bbf2f730f1a3944ff0fa68b
@@ -16,14 +16,73 @@

 package ml.dmlc.xgboost4j.scala.spark

+import java.util.concurrent.LinkedBlockingDeque
+
+import scala.util.Random
+
 import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, RabitTracker => PyRabitTracker}
 import ml.dmlc.xgboost4j.scala.rabit.{RabitTracker => ScalaRabitTracker}
 import ml.dmlc.xgboost4j.java.IRabitTracker.TrackerStatus
+import ml.dmlc.xgboost4j.scala.DMatrix
 import org.apache.spark.{SparkConf, SparkContext}
 import org.scalatest.FunSuite

-class RabitTrackerRobustnessSuite extends FunSuite with PerTest {
+class RabitSuite extends FunSuite with PerTest {
+
+  test("training with Scala-implemented Rabit tracker") {
+    val eval = new EvalError()
+    val training = buildDataFrame(Classification.train)
+    val testDM = new DMatrix(Classification.test.iterator)
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
+      "tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala"))
+    val model = new XGBoostClassifier(paramMap).fit(training)
+    assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
+  }
+
+  test("test Rabit allreduce to validate Scala-implemented Rabit tracker") {
+    val vectorLength = 100
+    val rdd = sc.parallelize(
+      (1 to numWorkers * vectorLength).toArray.map { _ => Random.nextFloat() }, numWorkers).cache()
+
+    val tracker = new ScalaRabitTracker(numWorkers)
+    tracker.start(0)
+    val trackerEnvs = tracker.getWorkerEnvs
+    val collectedAllReduceResults = new LinkedBlockingDeque[Array[Float]]()
+
+    val rawData = rdd.mapPartitions { iter =>
+      Iterator(iter.toArray)
+    }.collect()
+
+    val maxVec = (0 until vectorLength).toArray.map { j =>
+      (0 until numWorkers).toArray.map { i => rawData(i)(j) }.max
+    }
+
+    val allReduceResults = rdd.mapPartitions { iter =>
+      Rabit.init(trackerEnvs)
+      val arr = iter.toArray
+      val results = Rabit.allReduce(arr, Rabit.OpType.MAX)
+      Rabit.shutdown()
+      Iterator(results)
+    }.cache()
+
+    val sparkThread = new Thread() {
+      override def run(): Unit = {
+        allReduceResults.foreachPartition(() => _)
+        val byPartitionResults = allReduceResults.collect()
+        assert(byPartitionResults(0).length == vectorLength)
+        collectedAllReduceResults.put(byPartitionResults(0))
+      }
+    }
+    sparkThread.start()
+    assert(tracker.waitFor(0L) == 0)
+    sparkThread.join()
+
+    assert(collectedAllReduceResults.poll().sameElements(maxVec))
+  }
+
   test("test Java RabitTracker wrapper's exception handling: it should not hang forever.") {
     /*
       Deliberately create new instances of SparkContext in each unit test to avoid reusing the
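A note on the structure of the allreduce test above: the Spark action runs on a separate thread so that the main thread can block in tracker.waitFor(0L) until every Rabit worker reports completion, and the LinkedBlockingDeque hands the collected result back across the thread boundary.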
@@ -148,4 +207,23 @@ class RabitTrackerRobustnessSuite extends FunSuite with PerTest {
     // should fail due to connection timeout
     assert(tracker.waitFor(0L) == TrackerStatus.FAILURE.getStatusCode)
   }
+
+  test("should allow the dataframe containing rabit calls to be partially evaluated for" +
+    " multiple times (ISSUE-4406)") {
+    val paramMap = Map(
+      "eta" -> "1",
+      "max_depth" -> "6",
+      "silent" -> "1",
+      "objective" -> "binary:logistic")
+    val trainingDF = buildDataFrame(Classification.train)
+    val model = new XGBoostClassifier(paramMap ++ Array("num_round" -> 10,
+      "num_workers" -> numWorkers)).fit(trainingDF)
+    val prediction = model.transform(trainingDF)
+    // a partial evaluation of dataframe will cause rabit initialized but not shutdown in some
+    // threads
+    prediction.show()
+    // a full evaluation here will re-run init and shutdown all rabit proxy
+    // expecting no error
+    prediction.collect()
+  }
 }
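The new regression test relies on show() triggering only a partial evaluation: Spark computes just enough partitions to fill the requested rows, while collect() touches every partition. A standalone sketch of that behaviour, independent of XGBoost (all names local to this example):

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[4]").appName("partial-eval").getOrCreate()
    val df = spark.range(0, 1000, 1, numPartitions = 4).toDF("x")

    df.show(5)    // typically schedules tasks on just the first partition(s)
    df.collect()  // schedules tasks on all four partitions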
@@ -17,67 +17,17 @@
 package ml.dmlc.xgboost4j.scala.spark

 import java.nio.file.Files
-import java.util.concurrent.LinkedBlockingDeque

 import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
 import ml.dmlc.xgboost4j.scala.DMatrix
-import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
 import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
 import org.apache.hadoop.fs.{FileSystem, Path}

 import org.apache.spark.TaskContext
-import org.apache.spark.ml.linalg.Vectors
-import org.apache.spark.sql._
 import org.scalatest.FunSuite
-import scala.util.Random
-
-import ml.dmlc.xgboost4j.java.Rabit
-
-import org.apache.spark.ml.feature.VectorAssembler

 class XGBoostGeneralSuite extends FunSuite with PerTest {

-  test("test Rabit allreduce to validate Scala-implemented Rabit tracker") {
-    val vectorLength = 100
-    val rdd = sc.parallelize(
-      (1 to numWorkers * vectorLength).toArray.map { _ => Random.nextFloat() }, numWorkers).cache()
-
-    val tracker = new RabitTracker(numWorkers)
-    tracker.start(0)
-    val trackerEnvs = tracker.getWorkerEnvs
-    val collectedAllReduceResults = new LinkedBlockingDeque[Array[Float]]()
-
-    val rawData = rdd.mapPartitions { iter =>
-      Iterator(iter.toArray)
-    }.collect()
-
-    val maxVec = (0 until vectorLength).toArray.map { j =>
-      (0 until numWorkers).toArray.map { i => rawData(i)(j) }.max
-    }
-
-    val allReduceResults = rdd.mapPartitions { iter =>
-      Rabit.init(trackerEnvs)
-      val arr = iter.toArray
-      val results = Rabit.allReduce(arr, Rabit.OpType.MAX)
-      Rabit.shutdown()
-      Iterator(results)
-    }.cache()
-
-    val sparkThread = new Thread() {
-      override def run(): Unit = {
-        allReduceResults.foreachPartition(() => _)
-        val byPartitionResults = allReduceResults.collect()
-        assert(byPartitionResults(0).length == vectorLength)
-        collectedAllReduceResults.put(byPartitionResults(0))
-      }
-    }
-    sparkThread.start()
-    assert(tracker.waitFor(0L) == 0)
-    sparkThread.join()
-
-    assert(collectedAllReduceResults.poll().sameElements(maxVec))
-  }
-
   test("distributed training with the specified worker number") {
     val trainingRDD = sc.parallelize(Classification.train)
     val (booster, metrics) = XGBoost.trainDistributed(
@@ -101,18 +51,6 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
   }

-
-  test("training with Scala-implemented Rabit tracker") {
-    val eval = new EvalError()
-    val training = buildDataFrame(Classification.train)
-    val testDM = new DMatrix(Classification.test.iterator)
-    val paramMap = Map("eta" -> "1", "max_depth" -> "6",
-      "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
-      "tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala"))
-    val model = new XGBoostClassifier(paramMap).fit(training)
-    assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
-  }
-
   test("test with quantile hist with monotone_constraints (lossguide)") {
     val eval = new EvalError()
     val training = buildDataFrame(Classification.train)
@@ -9,6 +9,7 @@ import java.util.Map;
  * Rabit global class for synchronization.
  */
 public class Rabit {
+
   public enum OpType implements Serializable {
     MAX(0), MIN(1), SUM(2), BITWISE_OR(3);

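OpType implementing Serializable is what lets values such as Rabit.OpType.MAX travel inside Spark task closures, as the allreduce tests above do; Java enums are serializable by default, so the explicit declaration mainly makes that intent visible.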
rabit (submodule)
@@ -1 +1 @@
-Subproject commit 1cc34f01db56d28e8e80847cf0fc5e3ecf8bb67b
+Subproject commit a429748e244f67f6f144a697f3aa1b1978705b11
@@ -9,7 +9,7 @@ set -x
 rm -rf build/
 cd jvm-packages

-mvn --no-transfer-progress package -DskipTests
+mvn --no-transfer-progress package

 set +x
 set +e
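Dropping -DskipTests means this CI script now runs the JVM unit tests as part of the Maven package phase, matching the "Run unit tests after building JVM packages" item in the commit message.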
@@ -35,8 +35,7 @@ if [ ! -z "$RUN_INTEGRATION_TEST" ]
 then
   python3 get_iris.py
   spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ./target/xgboost4j-tester-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv
-  # Disabled due to https://github.com/dmlc/xgboost/issues/4406
-  #spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ./target/xgboost4j-tester-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv ${PWD}/native_model ${PWD}/pipeline_model
+  spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ./target/xgboost4j-tester-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv ${PWD}/native_model ${PWD}/pipeline_model
 fi

 set +x