From ac7a9edb06f2641bc68ba8edd4ce241259e20cbc Mon Sep 17 00:00:00 2001
From: Seth Hendrickson
Date: Fri, 20 Oct 2017 14:18:46 -0700
Subject: [PATCH] remove stale examples (#2816)

---
 jvm-packages/README.md | 157 ++---------------------------------------
 1 file changed, 6 insertions(+), 151 deletions(-)

diff --git a/jvm-packages/README.md b/jvm-packages/README.md
index 596f4fdf6..920f2fe06 100644
--- a/jvm-packages/README.md
+++ b/jvm-packages/README.md
@@ -10,158 +10,13 @@ XGBoost4J is the JVM package of xgboost. It brings all the optimizations
 and power xgboost into JVM ecosystem.
 
-- Train XGBoost models on scala and java with easy customizations.
-- Run distributed xgboost natively on jvm frameworks such as Flink and Spark.
+- Train XGBoost models in scala and java with easy customizations.
+- Run distributed xgboost natively on jvm frameworks such as
+Apache Flink and Apache Spark.
 
 You can find more about XGBoost on [Documentation](https://xgboost.readthedocs.org/en/latest/jvm/index.html) and [Resource Page](../demo/README.md).
 
-## Hello World
-**NOTE on LIBSVM Format**:
-- Use *1-based* ascending indexes for the LIBSVM format in distributed training mode
-
-  - Spark does the internal conversion, and does not accept formats that are 0-based
-- Whereas, use *0-based* indexes when predicting in normal mode - for instance, while using the saved model in the Python package
-
-### XGBoost Scala
-```scala
-import ml.dmlc.xgboost4j.scala.DMatrix
-import ml.dmlc.xgboost4j.scala.XGBoost
-
-object XGBoostScalaExample {
-  def main(args: Array[String]) {
-    // read training data, available at xgboost/demo/data
-    val trainData =
-      new DMatrix("/path/to/agaricus.txt.train")
-    // define parameters
-    val paramMap = List(
-      "eta" -> 0.1,
-      "max_depth" -> 2,
-      "objective" -> "binary:logistic").toMap
-    // number of iterations
-    val round = 2
-    // train the model
-    val model = XGBoost.train(trainData, paramMap, round)
-    // run prediction
-    val predTrain = model.predict(trainData)
-    // save model to the file
-    model.saveModel("/local/path/to/model")
-  }
-}
-```
-
-### XGBoost Spark
-
-XGBoost4J-Spark supports training XGBoost models through RDD and DataFrame
-
-RDD Version:
-
-```scala
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.util.MLUtils
-import ml.dmlc.xgboost4j.scala.spark.XGBoost
-
-object SparkWithRDD {
-  def main(args: Array[String]): Unit = {
-    if (args.length != 3) {
-      println(
-        "usage: program num_of_rounds training_path model_path")
-      sys.exit(1)
-    }
-    // if you do not want to use KryoSerializer in Spark, you can ignore the related configuration
-    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoost-spark-example")
-      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
-    sparkConf.registerKryoClasses(Array(classOf[Booster]))
-    val sc = new SparkContext(sparkConf)
-    val inputTrainPath = args(1)
-    val outputModelPath = args(2)
-    // number of iterations
-    val numRound = args(0).toInt
-    val trainRDD = MLUtils.loadLibSVMFile(sc, inputTrainPath)
-    // training parameters
-    val paramMap = List(
-      "eta" -> 0.1f,
-      "max_depth" -> 2,
-      "objective" -> "binary:logistic").toMap
-    // use 5 distributed workers to train the model
-    // useExternalMemory indicates whether to use external memory for caching training data
-    val model = XGBoost.train(trainRDD, paramMap, numRound, nWorkers = 5, useExternalMemory = true)
-    // save model to HDFS path
-    model.saveModelToHadoop(outputModelPath)
-  }
-}
-```
-
-DataFrame Version:
-
-```scala
-object SparkWithDataFrame {
-  def main(args: Array[String]): Unit = {
-    if (args.length != 5) {
-      println(
-        "usage: program num_of_rounds num_workers training_path test_path model_path")
-      sys.exit(1)
-    }
-    // create SparkSession
-    val sparkConf = new SparkConf().setAppName("XGBoost-spark-example")
-      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
-    sparkConf.registerKryoClasses(Array(classOf[Booster]))
-    val sparkSession = SparkSession.builder().appName("XGBoost-spark-example").config(sparkConf).
-      getOrCreate()
-    // create training and testing dataframes
-    val inputTrainPath = args(2)
-    val inputTestPath = args(3)
-    val outputModelPath = args(4)
-    // number of iterations
-    val numRound = args(0).toInt
-    import DataUtils._
-    val trainRDDOfRows = MLUtils.loadLibSVMFile(sparkSession.sparkContext, inputTrainPath).
-      map{ labeledPoint => Row(labeledPoint.features, labeledPoint.label)}
-    val trainDF = sparkSession.createDataFrame(trainRDDOfRows, StructType(
-      Array(StructField("features", ArrayType(FloatType)), StructField("label", IntegerType))))
-    val testRDDOfRows = MLUtils.loadLibSVMFile(sparkSession.sparkContext, inputTestPath).
-      zipWithIndex().map{ case (labeledPoint, id) =>
-        Row(id, labeledPoint.features, labeledPoint.label)}
-    val testDF = sparkSession.createDataFrame(testRDDOfRows, StructType(
-      Array(StructField("id", LongType),
-        StructField("features", ArrayType(FloatType)), StructField("label", IntegerType))))
-    // training parameters
-    val paramMap = List(
-      "eta" -> 0.1f,
-      "max_depth" -> 2,
-      "objective" -> "binary:logistic").toMap
-    val xgboostModel = XGBoost.trainWithDataset(
-      trainDF, paramMap, numRound, nWorkers = args(1).toInt, useExternalMemory = true)
-    // xgboost-spark appends the column containing prediction results
-    xgboostModel.transform(testDF).show()
-  }
-}
-```
-
-### XGBoost Flink
-```scala
-import ml.dmlc.xgboost4j.scala.flink.XGBoost
-import org.apache.flink.api.scala._
-import org.apache.flink.api.scala.ExecutionEnvironment
-import org.apache.flink.ml.MLUtils
-
-object DistTrainWithFlink {
-  def main(args: Array[String]) {
-    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
-    // read training data
-    val trainData =
-      MLUtils.readLibSVM(env, "/path/to/data/agaricus.txt.train")
-    // define parameters
-    val paramMap = List(
-      "eta" -> 0.1,
-      "max_depth" -> 2,
-      "objective" -> "binary:logistic").toMap
-    // number of iterations
-    val round = 2
-    // train the model
-    val model = XGBoost.train(trainData, paramMap, round)
-    val predTrain = model.predict(trainData.map{x => x.vector})
-    model.saveModelToHadoop("file:///path/to/xgboost.model")
-  }
-}
-```
-
+## Examples
+Full code examples for Scala, Java, Apache Spark, and Apache Flink can
+be found in the [examples package](https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example).