Merge pull request #922 from CodingCat/label
spark with new labeledpoint
commit 3ddddfce79
@@ -20,6 +20,7 @@
   <modules>
     <module>xgboost4j</module>
     <module>xgboost4j-demo</module>
+    <module>xgboost4j-spark</module>
     <module>xgboost4j-flink</module>
   </modules>
   <build>
@@ -118,6 +119,19 @@
         <artifactId>maven-surefire-plugin</artifactId>
         <version>2.19.1</version>
       </plugin>
+      <plugin>
+        <groupId>org.scalatest</groupId>
+        <artifactId>scalatest-maven-plugin</artifactId>
+        <version>1.0</version>
+        <executions>
+          <execution>
+            <id>test</id>
+            <goals>
+              <goal>test</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
     </plugins>
   </build>
   <dependencies>
@@ -150,7 +164,7 @@
     <dependency>
       <groupId>com.typesafe</groupId>
       <artifactId>config</artifactId>
-      <version>1.3.0</version>
+      <version>1.2.1</version>
     </dependency>
   </dependencies>
 </project>
@@ -16,17 +16,28 @@
 
 package ml.dmlc.xgboost4j.scala.spark
 
-import java.util.{Iterator => JIterator}
 
-import scala.collection.mutable.ListBuffer
 import scala.collection.JavaConverters._
 
-import ml.dmlc.xgboost4j.java.DataBatch
 import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector}
-import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.regression.{LabeledPoint => SparkLabeledPoint}
 
+import ml.dmlc.xgboost4j.LabeledPoint
 
 private[spark] object DataUtils extends Serializable {
 
+  implicit def fromSparkToXGBoostLabeledPoints(sps: Iterator[SparkLabeledPoint]):
+      java.util.Iterator[LabeledPoint] = {
+    (for (p <- sps) yield {
+      p.features match {
+        case denseFeature: DenseVector =>
+          LabeledPoint.fromDenseVector(p.label.toFloat, denseFeature.values.map(_.toFloat))
+        case sparseFeature: SparseVector =>
+          LabeledPoint.fromSparseVector(p.label.toFloat, sparseFeature.indices,
+            sparseFeature.values.map(_.toFloat))
+      }
+    }).asJava
+  }
 
   private def fetchUpdateFromSparseVector(sparseFeature: SparseVector): (List[Int], List[Float]) = {
     (sparseFeature.indices.toList, sparseFeature.values.map(_.toFloat).toList)
   }
@@ -37,38 +48,4 @@ private[spark] object DataUtils extends Serializable {
     case sparseFeature: SparseVector =>
       fetchUpdateFromSparseVector(sparseFeature)
   }
-
-  def fromLabeledPointsToSparseMatrix(points: Iterator[LabeledPoint]): JIterator[DataBatch] = {
-    // TODO: support weight
-    var samplePos = 0
-    // TODO: change hard value
-    val loadingBatchSize = 100
-    val rowOffset = new ListBuffer[Long]
-    val label = new ListBuffer[Float]
-    val featureIndices = new ListBuffer[Int]
-    val featureValues = new ListBuffer[Float]
-    val dataBatches = new ListBuffer[DataBatch]
-    for (point <- points) {
-      val (nonZeroIndices, nonZeroValues) = fetchUpdateFromVector(point.features)
-      rowOffset(samplePos) = rowOffset.size
-      label(samplePos) = point.label.toFloat
-      for (i <- nonZeroIndices.indices) {
-        featureIndices += nonZeroIndices(i)
-        featureValues += nonZeroValues(i)
-      }
-      samplePos += 1
-      if (samplePos % loadingBatchSize == 0) {
-        // create a data batch
-        dataBatches += new DataBatch(
-          rowOffset.toArray.clone(),
-          null, label.toArray.clone(), featureIndices.toArray.clone(),
-          featureValues.toArray.clone())
-        rowOffset.clear()
-        label.clear()
-        featureIndices.clear()
-        featureValues.clear()
-      }
-    }
-    dataBatches.iterator.asJava
-  }
 }
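For orientation only, and not part of the diff: a minimal sketch of how the implicit conversion added to DataUtils above is meant to be used. The object name and the sample points are made up; the xgboost4j classes are the ones introduced or referenced in this PR.

// Illustrative sketch, not part of this PR. Assumes xgboost4j and the
// xgboost4j-spark module added by this change are on the classpath.
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint => SparkLabeledPoint}
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.spark.DataUtils._

object DataUtilsSketch {
  def main(args: Array[String]): Unit = {
    // A couple of made-up MLlib points, one dense and one sparse.
    val sparkPoints: Iterator[SparkLabeledPoint] = Iterator(
      SparkLabeledPoint(1.0, Vectors.dense(0.5, 0.0, 1.2)),
      SparkLabeledPoint(0.0, Vectors.sparse(3, Array(1), Array(2.0))))
    // The implicit fromSparkToXGBoostLabeledPoints fires wherever a
    // java.util.Iterator[ml.dmlc.xgboost4j.LabeledPoint] is expected,
    // e.g. the java DMatrix constructor used elsewhere in this PR.
    val xgbPoints: java.util.Iterator[LabeledPoint] = sparkPoints
    while (xgbPoints.hasNext) println(xgbPoints.next())
  }
}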
@@ -17,41 +17,64 @@
 package ml.dmlc.xgboost4j.scala.spark
 
 import scala.collection.immutable.HashMap
-import scala.collection.JavaConverters._
 
 import com.typesafe.config.Config
-import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
-import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
-import org.apache.spark.SparkContext
+import org.apache.spark.{TaskContext, SparkContext}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
 
-object XGBoost {
+import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, Rabit, RabitTracker}
+import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
 
-  private var _sc: Option[SparkContext] = None
+object XGBoost extends Serializable {
 
   implicit def convertBoosterToXGBoostModel(booster: Booster): XGBoostModel = {
     new XGBoostModel(booster)
   }
 
-  def train(config: Config, trainingData: RDD[LabeledPoint], obj: ObjectiveTrait = null,
-      eval: EvalTrait = null): XGBoostModel = {
+  private[spark] def buildDistributedBoosters(
+      trainingData: RDD[LabeledPoint],
+      xgBoostConfMap: Map[String, AnyRef],
+      numWorkers: Int, round: Int, obj: ObjectiveTrait, eval: EvalTrait): RDD[Booster] = {
+    import DataUtils._
     val sc = trainingData.sparkContext
-    val dataUtilsBroadcast = sc.broadcast(DataUtils)
-    val filePath = config.getString("inputPath") // configuration entry name to be fixed
+    val tracker = new RabitTracker(numWorkers)
+    if (tracker.start()) {
+      trainingData.repartition(numWorkers).mapPartitions {
+        trainingSamples =>
+          Rabit.init(new java.util.HashMap[String, String]() {
+            put("DMLC_TASK_ID", TaskContext.getPartitionId().toString)
+          })
+          val dMatrix = new DMatrix(new JDMatrix(trainingSamples, null))
+          val booster = SXGBoost.train(xgBoostConfMap, dMatrix, round,
+            watches = new HashMap[String, DMatrix], obj, eval)
+          Rabit.shutdown()
+          Iterator(booster)
+      }.cache()
+    } else {
+      null
+    }
+  }
 
+  def train(config: Config, trainingData: RDD[LabeledPoint], obj: ObjectiveTrait = null,
+      eval: EvalTrait = null): Option[XGBoostModel] = {
+    import DataUtils._
     val numWorkers = config.getInt("numWorkers")
     val round = config.getInt("round")
-    // TODO: build configuration map from config
-    val xgBoostConfigMap = new HashMap[String, AnyRef]()
-    val boosters = trainingData.repartition(numWorkers).mapPartitions {
-      trainingSamples =>
-        val dataBatches = dataUtilsBroadcast.value.fromLabeledPointsToSparseMatrix(trainingSamples)
-        val dMatrix = new DMatrix(new JDMatrix(dataBatches, null))
-        Iterator(SXGBoost.train(xgBoostConfigMap, dMatrix, round, watches = null, obj, eval))
-    }.cache()
-    // force the job
-    sc.runJob(boosters, (boosters: Iterator[Booster]) => boosters)
-    // TODO: how to choose best model
-    boosters.first()
+    val sc = trainingData.sparkContext
+    val tracker = new RabitTracker(numWorkers)
+    if (tracker.start()) {
+      // TODO: build configuration map from config
+      val xgBoostConfigMap = new HashMap[String, AnyRef]()
+      val boosters = buildDistributedBoosters(trainingData, xgBoostConfigMap, numWorkers, round,
+        obj, eval)
+      // force the job
+      sc.runJob(boosters, (boosters: Iterator[Booster]) => boosters)
+      tracker.waitFor()
+      // TODO: how to choose best model
+      Some(boosters.first())
+    } else {
+      None
+    }
   }
 }
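As a rough usage sketch, illustrative only and not part of the diff: driving the new train entry point with a Typesafe Config. The object name, app name, and toy dataset are made up; numWorkers and round are the only config keys this version of the code reads, and the booster parameter map is still a TODO in the hunk above.

// Illustrative sketch, not part of this PR; toy data and names are made up.
import com.typesafe.config.ConfigFactory
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import ml.dmlc.xgboost4j.scala.spark.XGBoost

object TrainSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("train-sketch"))
    // Tiny toy dataset; real callers would load something like the agaricus
    // data used by the test suite added in this PR.
    val trainingRDD = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0)),
      LabeledPoint(0.0, Vectors.dense(0.0, 1.0))), numSlices = 2)
    // Only numWorkers and round are read from the Config at this point.
    val config = ConfigFactory.parseString("numWorkers = 2\nround = 2")
    XGBoost.train(config, trainingRDD) match {
      case Some(model) =>
        // One prediction array per partition of the input RDD.
        println(s"trained; scored ${model.predict(trainingRDD).count()} partitions")
      case None =>
        println("Rabit tracker failed to start")
    }
    sc.stop()
  }
}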
@@ -16,22 +16,25 @@
 
 package ml.dmlc.xgboost4j.scala.spark
 
-import scala.collection.JavaConverters._
+import org.apache.spark.mllib.regression.{LabeledPoint => SparkLabeledPoint}
+import org.apache.spark.rdd.RDD
 
 import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
 import ml.dmlc.xgboost4j.scala.{DMatrix, Booster}
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.rdd.RDD
 
 class XGBoostModel(booster: Booster) extends Serializable {
 
-  def predict(testSet: RDD[LabeledPoint]): RDD[Array[Array[Float]]] = {
+  def predict(testSet: RDD[SparkLabeledPoint]): RDD[Array[Array[Float]]] = {
+    import DataUtils._
     val broadcastBooster = testSet.sparkContext.broadcast(booster)
     val dataUtils = testSet.sparkContext.broadcast(DataUtils)
     testSet.mapPartitions { testSamples =>
-      val dataBatches = dataUtils.value.fromLabeledPointsToSparseMatrix(testSamples)
-      val dMatrix = new DMatrix(new JDMatrix(dataBatches, null))
+      val dMatrix = new DMatrix(new JDMatrix(testSamples, null))
       Iterator(broadcastBooster.value.predict(dMatrix))
     }
   }
+
+  def predict(testSet: DMatrix): Array[Array[Float]] = {
+    booster.predict(testSet)
+  }
 }
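A small sketch of the two predict overloads above, illustrative only and not part of the diff: the object name, helper name, and sample size are made up, while the DMatrix construction mirrors what the test suite in this PR does.

// Illustrative sketch, not part of this PR.
import org.apache.spark.mllib.regression.{LabeledPoint => SparkLabeledPoint}
import org.apache.spark.rdd.RDD
import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.DMatrix
import ml.dmlc.xgboost4j.scala.spark.{DataUtils, XGBoostModel}

object PredictSketch {
  // model would come from XGBoost.train(...); testRDD is supplied by the caller.
  def score(model: XGBoostModel, testRDD: RDD[SparkLabeledPoint]): Unit = {
    import DataUtils._
    // Distributed scoring: each partition becomes a DMatrix on an executor,
    // so the result holds one Array[Array[Float]] per partition.
    val perPartition = model.predict(testRDD)
    println(s"partitions scored: ${perPartition.count()}")

    // Local scoring: pull a handful of rows to the driver, build a DMatrix via
    // the same implicit LabeledPoint conversion, and use the DMatrix overload.
    val localMatrix = new DMatrix(new JDMatrix(testRDD.take(10).iterator, null))
    val local = model.predict(localMatrix)
    println(s"local rows scored: ${local.length}")
  }
}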
jvm-packages/xgboost4j-spark/src/test/resources/agaricus.txt.test (new normal file, 1611 lines; file diff suppressed because it is too large)
jvm-packages/xgboost4j-spark/src/test/resources/agaricus.txt.train (new normal file, 6513 lines; file diff suppressed because it is too large)
@@ -0,0 +1,142 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package ml.dmlc.xgboost4j.scala.spark
+
+import java.io.File
+
+import scala.collection.mutable.ListBuffer
+import scala.io.Source
+import scala.tools.reflect.Eval
+
+import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, XGBoostError}
+import ml.dmlc.xgboost4j.scala.{DMatrix, EvalTrait}
+import org.apache.commons.logging.LogFactory
+import org.apache.spark.mllib.linalg.DenseVector
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.{SparkConf, SparkContext}
+import org.scalatest.{BeforeAndAfterAll, FunSuite}
+
+class XGBoostSuite extends FunSuite with BeforeAndAfterAll {
+
+  private var sc: SparkContext = null
+  private val numWorker = 4
+
+  private class EvalError extends EvalTrait {
+
+    val logger = LogFactory.getLog(classOf[EvalError])
+
+    private[xgboost4j] var evalMetric: String = "custom_error"
+
+    /**
+     * get evaluate metric
+     *
+     * @return evalMetric
+     */
+    override def getMetric: String = evalMetric
+
+    /**
+     * evaluate with predicts and data
+     *
+     * @param predicts predictions as array
+     * @param dmat data matrix to evaluate
+     * @return result of the metric
+     */
+    override def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float = {
+      var error: Float = 0f
+      var labels: Array[Float] = null
+      try {
+        labels = dmat.getLabel
+      } catch {
+        case ex: XGBoostError =>
+          logger.error(ex)
+          return -1f
+      }
+      val nrow: Int = predicts.length
+      for (i <- 0 until nrow) {
+        if (labels(i) == 0.0 && predicts(i)(0) > 0) {
+          error += 1
+        } else if (labels(i) == 1.0 && predicts(i)(0) <= 0) {
+          error += 1
+        }
+      }
+      error / labels.length
+    }
+  }
+
+  override def beforeAll(): Unit = {
+    // build SparkContext
+    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
+    sc = new SparkContext(sparkConf)
+  }
+
+  override def afterAll(): Unit = {
+    if (sc != null) {
+      sc.stop()
+    }
+  }
+
+  private def fromSVMStringToLabeledPoint(line: String): LabeledPoint = {
+    val labelAndFeatures = line.split(" ")
+    val label = labelAndFeatures(0).toInt
+    val features = labelAndFeatures.tail
+    val denseFeature = new Array[Double](129)
+    for (feature <- features) {
+      val idAndValue = feature.split(":")
+      denseFeature(idAndValue(0).toInt) = idAndValue(1).toDouble
+    }
+    LabeledPoint(label, new DenseVector(denseFeature))
+  }
+
+  private def readFile(filePath: String): List[LabeledPoint] = {
+    val file = Source.fromFile(new File(filePath))
+    val sampleList = new ListBuffer[LabeledPoint]
+    for (sample <- file.getLines()) {
+      sampleList += fromSVMStringToLabeledPoint(sample)
+    }
+    sampleList.toList
+  }
+
+  private def buildRDD(filePath: String): RDD[LabeledPoint] = {
+    val sampleList = readFile(filePath)
+    sc.parallelize(sampleList, numWorker)
+  }
+
+  private def buildTrainingRDD(): RDD[LabeledPoint] = {
+    val trainRDD = buildRDD(getClass.getResource("/agaricus.txt.train").getFile)
+    trainRDD
+  }
+
+  test("build RDD containing boosters") {
+    val trainingRDD = buildTrainingRDD()
+    val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    import DataUtils._
+    val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
+    val boosterRDD = XGBoost.buildDistributedBoosters(
+      trainingRDD,
+      List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
+        "objective" -> "binary:logistic").toMap,
+      numWorker, 2, null, null)
+    val boosterCount = boosterRDD.count()
+    assert(boosterCount === numWorker)
+    val boosters = boosterRDD.collect()
+    for (booster <- boosters) {
+      val predicts = booster.predict(testSetDMatrix, true)
+      assert(new EvalError().eval(predicts, testSetDMatrix) < 0.1)
+    }
+  }
+}
@@ -29,19 +29,6 @@
         <skipAssembly>false</skipAssembly>
       </configuration>
     </plugin>
-    <plugin>
-      <groupId>org.scalatest</groupId>
-      <artifactId>scalatest-maven-plugin</artifactId>
-      <version>1.0</version>
-      <executions>
-        <execution>
-          <id>test</id>
-          <goals>
-            <goal>test</goal>
-          </goals>
-        </execution>
-      </executions>
-    </plugin>
   </plugins>
 </build>
 <dependencies>