[jvm-packages] Integration with Spark Dataframe/Dataset (#1559)
* bump up to scala 2.11 * framework of data frame integration * test consistency between RDD and DataFrame * order preservation * test order preservation * example code and fix makefile * improve type checking * improve APIs * user docs * work around travis CI's limitation on log length * adjust test structure * integrate with Spark -1 .x * spark 2.x integration * remove spark 1.x implementation but provide instructions on how to downgrade
This commit is contained in:
@@ -49,12 +49,17 @@ object XGBoostScalaExample {
|
||||
```
|
||||
|
||||
### XGBoost Spark
|
||||
|
||||
XGBoost4J-Spark supports training XGBoost model through RDD and Dataframe
|
||||
|
||||
RDD Version:
|
||||
|
||||
```scala
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.mllib.util.MLUtils
|
||||
import ml.dmlc.xgboost4j.scala.spark.XGBoost
|
||||
|
||||
object DistTrainWithSpark {
|
||||
object SparkWithRDD {
|
||||
def main(args: Array[String]): Unit = {
|
||||
if (args.length != 3) {
|
||||
println(
|
||||
@@ -85,6 +90,52 @@ object DistTrainWithSpark {
|
||||
}
|
||||
```
|
||||
|
||||
Dataframe Version:
|
||||
|
||||
```scala
|
||||
object SparkWithDataFrame {
|
||||
def main(args: Array[String]): Unit = {
|
||||
if (args.length != 5) {
|
||||
println(
|
||||
"usage: program num_of_rounds num_workers training_path test_path model_path")
|
||||
sys.exit(1)
|
||||
}
|
||||
// create SparkSession
|
||||
val sparkConf = new SparkConf().setAppName("XGBoost-spark-example")
|
||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||
sparkConf.registerKryoClasses(Array(classOf[Booster]))
|
||||
val sparkSession = SparkSession.builder().appName("XGBoost-spark-example").config(sparkConf).
|
||||
getOrCreate()
|
||||
// create training and testing dataframes
|
||||
val inputTrainPath = args(2)
|
||||
val inputTestPath = args(3)
|
||||
val outputModelPath = args(4)
|
||||
// number of iterations
|
||||
val numRound = args(0).toInt
|
||||
import DataUtils._
|
||||
val trainRDDOfRows = MLUtils.loadLibSVMFile(sparkSession.sparkContext, inputTrainPath).
|
||||
map{ labeledPoint => Row(labeledPoint.features, labeledPoint.label)}
|
||||
val trainDF = sparkSession.createDataFrame(trainRDDOfRows, StructType(
|
||||
Array(StructField("features", ArrayType(FloatType)), StructField("label", IntegerType))))
|
||||
val testRDDOfRows = MLUtils.loadLibSVMFile(sparkSession.sparkContext, inputTestPath).
|
||||
zipWithIndex().map{ case (labeledPoint, id) =>
|
||||
Row(id, labeledPoint.features, labeledPoint.label)}
|
||||
val testDF = sparkSession.createDataFrame(testRDDOfRows, StructType(
|
||||
Array(StructField("id", LongType),
|
||||
StructField("features", ArrayType(FloatType)), StructField("label", IntegerType))))
|
||||
// training parameters
|
||||
val paramMap = List(
|
||||
"eta" -> 0.1f,
|
||||
"max_depth" -> 2,
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgboostModel = XGBoost.trainWithDataset(
|
||||
trainDF, paramMap, numRound, nWorkers = args(1).toInt, useExternalMemory = true)
|
||||
// xgboost-spark appends the column containing prediction results
|
||||
xgboostModel.transform(testDF).show()
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### XGBoost Flink
|
||||
```scala
|
||||
import ml.dmlc.xgboost4j.scala.flink.XGBoost
|
||||
|
||||
Reference in New Issue
Block a user