fix merge conflicts

commit 4b2eedc186
@@ -22,7 +22,7 @@ What's New
----------
* [XGBoost4J: Portable Distributed XGBoost in Spark, Flink and Dataflow](http://dmlc.ml/2016/03/14/xgboost4j-portable-distributed-xgboost-in-spark-flink-and-dataflow.html), see [JVM-Package](https://github.com/dmlc/xgboost/tree/master/jvm-packages)
* [Story and Lessons Behind the Evolution of XGBoost](http://homes.cs.washington.edu/~tqchen/2016/03/10/story-and-lessons-behind-the-evolution-of-xgboost.html)
* [Tutorial: Distributed XGBoost on AWS with YARN](https://xgboost.readthedocs.org/en/latest/tutorial/aws_yarn.html)
* [Tutorial: Distributed XGBoost on AWS with YARN](https://xgboost.readthedocs.io/en/latest/tutorials/aws_yarn.html)
* [XGBoost brick](NEWS.md) Release

Ask a Question
@@ -84,10 +84,10 @@ Additional parameters for Dart Booster
* normalize_type [default="tree"]
  - type of normalization algorithm (a worked weight sketch follows this hunk).
  - "tree": New trees have the same weight as each of the dropped trees.
    weight of new trees is learning_rate / (k + learning_rate)
    weight of new trees is 1 / (k + learning_rate)
    dropped trees are scaled by a factor of k / (k + learning_rate)
  - "forest": New trees have the same weight as the sum of the dropped trees (forest).
    weight of new trees is learning_rate / (1 + learning_rate)
    weight of new trees is 1 / (1 + learning_rate)
    dropped trees are scaled by a factor of 1 / (1 + learning_rate)
* rate_drop [default=0.0]
  - dropout rate.
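To make the two normalization modes concrete, here is a small Scala sketch of the weight arithmetic described in the list above (the object and method names are invented for illustration; this is not XGBoost API code). With normalize_type "tree", each of the k dropped trees is scaled by k / (k + learning_rate) and every new tree gets weight 1 / (k + learning_rate); with "forest", both the scale factor and the new-tree weight are 1 / (1 + learning_rate).

// Illustrative sketch of DART weight normalization; not part of the XGBoost codebase.
object DartWeightSketch {
  /** Returns (weight assigned to each new tree, scale factor applied to dropped trees). */
  def dartWeights(normalizeType: String, numDropped: Int, learningRate: Double): (Double, Double) =
    normalizeType match {
      case "tree" =>
        // each new tree ends up with the same weight as each rescaled dropped tree
        (1.0 / (numDropped + learningRate), numDropped / (numDropped + learningRate))
      case "forest" =>
        // each new tree ends up with the same weight as the whole rescaled dropped forest
        (1.0 / (1.0 + learningRate), 1.0 / (1.0 + learningRate))
      case other =>
        throw new IllegalArgumentException(s"unknown normalize_type: $other")
    }

  def main(args: Array[String]): Unit = {
    val (newWeight, scale) = dartWeights("tree", numDropped = 3, learningRate = 0.3)
    println(f"tree: new-tree weight = $newWeight%.3f, dropped-tree scale = $scale%.3f")
  }
}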
@@ -16,20 +16,16 @@

package ml.dmlc.xgboost4j.scala.spark

import java.nio.file.Paths

import scala.collection.mutable
import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.hadoop.fs.{Path, FileSystem}

import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, Rabit, RabitTracker, XGBoostError}
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
import org.apache.commons.logging.LogFactory
import org.apache.spark.{SparkContext, TaskContext}
import org.apache.hadoop.fs.Path
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, XGBoostError, Rabit, RabitTracker}
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
import org.apache.spark.{SparkContext, TaskContext}

object XGBoost extends Serializable {
  private val logger = LogFactory.getLog("XGBoostSpark")
@@ -58,22 +54,33 @@ object XGBoost extends Serializable {
      }
    }
    val appName = partitionedData.context.appName
    // to work around the empty partitions in the training dataset,
    // this might not be the most efficient implementation, see
    // (https://github.com/dmlc/xgboost/issues/1277)
    partitionedData.mapPartitions {
      trainingSamples =>
        rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString)
        Rabit.init(rabitEnv.asJava)
        val cacheFileName: String = {
          if (useExternalMemory && trainingSamples.hasNext) {
            s"$appName-dtrain_cache-${TaskContext.getPartitionId()}"
          } else {
            null
        var booster: Booster = null
        if (trainingSamples.hasNext) {
          val cacheFileName: String = {
            if (useExternalMemory && trainingSamples.hasNext) {
              s"$appName-dtrain_cache-${TaskContext.getPartitionId()}"
            } else {
              null
            }
          }
          val trainingSet = new DMatrix(new JDMatrix(trainingSamples, cacheFileName))
          booster = SXGBoost.train(trainingSet, xgBoostConfMap, round,
            watches = new mutable.HashMap[String, DMatrix] {
              put("train", trainingSet)
            }.toMap, obj, eval)
          Rabit.shutdown()
        } else {
          Rabit.shutdown()
          throw new XGBoostError(s"detect the empty partition in training dataset, partition ID:" +
            s" ${TaskContext.getPartitionId().toString}")
        }
        val trainingSet = new DMatrix(new JDMatrix(trainingSamples, cacheFileName))
        val booster = SXGBoost.train(trainingSet, xgBoostConfMap, round,
          watches = new mutable.HashMap[String, DMatrix]{put("train", trainingSet)}.toMap,
          obj, eval)
        Rabit.shutdown()
        Iterator(booster)
    }.cache()
  }
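The comment in this hunk references dmlc/xgboost#1277: after repartitioning, some training partitions may contain no rows, which is why the new code either trains on a non-empty partition or shuts Rabit down and raises an XGBoostError. Below is a minimal Scala sketch of how such empty partitions arise; the object name and the local Spark setup are assumptions made for illustration and are not part of this commit.

// Illustrative only: shows that repartitioning a tiny RDD leaves some partitions empty.
import org.apache.spark.SparkContext

object EmptyPartitionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[4]", "EmptyPartitionSketch")
    // 3 records spread across 8 partitions: at least 5 partitions must be empty
    val rdd = sc.parallelize(Seq(1, 2, 3)).repartition(8)
    val partitionSizes = rdd.mapPartitions(it => Iterator(it.size)).collect()
    println(partitionSizes.mkString(", "))  // prints several zeros
    sc.stop()
  }
}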
@@ -18,7 +18,7 @@ package ml.dmlc.xgboost4j.scala.spark

import org.apache.hadoop.fs.{Path, FileSystem}
import org.apache.spark.{TaskContext, SparkContext}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.{DenseVector, Vector}
import org.apache.spark.rdd.RDD
import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.{DMatrix, Booster}
@@ -27,6 +27,7 @@ class XGBoostModel(_booster: Booster)(implicit val sc: SparkContext) extends Ser

  /**
   * Predict result with the given testset (represented as RDD)
   *
   * @param testSet test set represented as RDD
   * @param useExternalCache whether to use external cache for the test set
   */
@@ -51,6 +52,31 @@ class XGBoostModel(_booster: Booster)(implicit val sc: SparkContext) extends Ser
    }
  }

  /**
   * Predict result with the given testset (represented as RDD)
   * @param testSet test set represented as RDD
   * @param missingValue the specified value to represent the missing value
   */
  def predict(testSet: RDD[DenseVector], missingValue: Float): RDD[Array[Array[Float]]] = {
    val broadcastBooster = testSet.sparkContext.broadcast(_booster)
    testSet.mapPartitions { testSamples =>
      val sampleArray = testSamples.toList
      val numRows = sampleArray.size
      if (numRows == 0) {
        Iterator()
      } else {
        // only read the column count once the partition is known to be non-empty
        val numColumns = sampleArray.head.size
        // translate to the flat row-major layout required by DMatrix
        val flatSampleArray = new Array[Float](numRows * numColumns)
        for (i <- flatSampleArray.indices) {
          flatSampleArray(i) = sampleArray(i / numColumns).values(i % numColumns).toFloat
        }
        val dMatrix = new DMatrix(flatSampleArray, numRows, numColumns, missingValue)
        Iterator(broadcastBooster.value.predict(dMatrix))
      }
    }
  }

  /**
   * predict result given the test data (represented as DMatrix)
   */
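A hedged end-to-end sketch of how the new dense-vector predict overload might be used; it mirrors the test added to XGBoostSuite further down, but the object name, the local Spark setup, the synthetic data, and the specific parameter values are assumptions made for illustration, not part of this commit.

// Illustrative usage of XGBoostModel.predict(RDD[DenseVector], missingValue); not from this commit.
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

import ml.dmlc.xgboost4j.scala.spark.XGBoost

object DensePredictSketch {
  def main(args: Array[String]): Unit = {
    implicit val sc: SparkContext = new SparkContext("local[2]", "DensePredictSketch")
    // tiny synthetic dataset: the last column always holds the missing-value sentinel -0.1
    val rows = (0 until 100).map { _ =>
      val values = Array.fill(4)(Random.nextDouble()) :+ -0.1
      LabeledPoint(Random.nextInt(2).toDouble, Vectors.dense(values))
    }
    val trainingRDD = sc.parallelize(rows).repartition(2)
    val paramMap = List("eta" -> "1", "max_depth" -> "2", "objective" -> "binary:logistic").toMap
    val model = XGBoost.train(trainingRDD, paramMap, 5, 2)
    // predict on dense vectors, telling XGBoost that -0.1f marks a missing cell
    val predictions = model.predict(trainingRDD.map(_.features.toDense), missingValue = -0.1f).collect()
    println(s"collected predictions from ${predictions.length} partitions")
    sc.stop()
  }
}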
@@ -21,6 +21,7 @@ import java.nio.file.Files

import scala.collection.mutable.ListBuffer
import scala.io.Source
import scala.util.Random

import org.apache.commons.logging.LogFactory
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors, DenseVector}
@@ -208,7 +209,41 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
      "objective" -> "binary:logistic").toMap
    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)

    println(xgBoostModel.predict(testRDD))
    println(xgBoostModel.predict(testRDD).collect())
  }

  test("test with dense vectors containing missing value") {
    def buildDenseRDD(): RDD[LabeledPoint] = {
      val nrow = 100
      val ncol = 5
      val data0 = Array.ofDim[Double](nrow, ncol)
      // put random nums
      for (r <- 0 until nrow; c <- 0 until ncol) {
        data0(r)(c) = {
          if (c == ncol - 1) {
            -0.1
          } else {
            Random.nextDouble()
          }
        }
      }
      // create label
      val label0 = new Array[Double](nrow)
      for (i <- label0.indices) {
        label0(i) = Random.nextDouble()
      }
      val points = new ListBuffer[LabeledPoint]
      for (r <- 0 until nrow) {
        points += LabeledPoint(label0(r), Vectors.dense(data0(r)))
      }
      sc.parallelize(points)
    }
    val trainingRDD = buildDenseRDD().repartition(4)
    val testRDD = buildDenseRDD().repartition(4)
    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
      "objective" -> "binary:logistic").toMap
    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, 4)
    xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect()
  }

  test("training with external memory cache") {
@@ -118,6 +118,19 @@ public class DMatrix {
    handle = out[0];
  }

  /**
   * create DMatrix from dense matrix
   * @param data data values
   * @param nrow number of rows
   * @param ncol number of columns
   * @param missing the specified value to represent the missing value
   */
  public DMatrix(float[] data, int nrow, int ncol, float missing) throws XGBoostError {
    long[] out = new long[1];
    JNIErrorHandle.checkCall(XGBoostJNI.XGDMatrixCreateFromMat(data, nrow, ncol, missing, out));
    handle = out[0];
  }

  /**
   * used for DMatrix slice
   */
@@ -67,6 +67,19 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
    this(new JDMatrix(data, nrow, ncol))
  }

  /**
   * create DMatrix from dense matrix
   *
   * @param data data values
   * @param nrow number of rows
   * @param ncol number of columns
   * @param missing the specified value to represent the missing value
   */
  @throws(classOf[XGBoostError])
  def this(data: Array[Float], nrow: Int, ncol: Int, missing: Float) {
    this(new JDMatrix(data, nrow, ncol, missing))
  }

  /**
   * set label of dmatrix
   *
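For context, a short sketch of using the new Scala constructor added in this hunk; the 2x3 matrix, its values, and the -0.1f sentinel are made up for illustration.

// Illustrative only: build a 2x3 DMatrix where cells equal to -0.1f are treated as missing.
import ml.dmlc.xgboost4j.scala.DMatrix

object DenseDMatrixSketch {
  def main(args: Array[String]): Unit = {
    // row-major layout: two rows of three features each
    val values = Array(0.5f, -0.1f, 1.2f,
                       0.7f, 0.3f, -0.1f)
    val dmat = new DMatrix(values, 2, 3, -0.1f)
    dmat.setLabel(Array(1.0f, 0.0f))
    println(s"rows = ${dmat.rowNum}")
  }
}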
@@ -125,4 +125,34 @@ public class DMatrixTest {

    TestCase.assertTrue(Arrays.equals(weights, dmat0.getWeight()));
  }

  @Test
  public void testCreateFromDenseMatrixWithMissingValue() throws XGBoostError {
    //create DMatrix from 10*5 dense matrix
    int nrow = 10;
    int ncol = 5;
    float[] data0 = new float[nrow * ncol];
    //put random nums
    Random random = new Random();
    for (int i = 0; i < nrow * ncol; i++) {
      if (i % 10 == 0) {
        data0[i] = -0.1f;
      } else {
        data0[i] = random.nextFloat();
      }
    }

    //create label
    float[] label0 = new float[nrow];
    for (int i = 0; i < nrow; i++) {
      label0[i] = random.nextFloat();
    }

    DMatrix dmat0 = new DMatrix(data0, nrow, ncol, -0.1f);
    dmat0.setLabel(label0);

    //check
    TestCase.assertTrue(dmat0.rowNum() == 10);
    TestCase.assertTrue(dmat0.getLabel().length == 10);
  }
}
@@ -82,4 +82,28 @@ class DMatrixSuite extends FunSuite {
    dmat0.setWeight(weights)
    assert(weights === dmat0.getWeight)
  }

  test("create DMatrix from DenseMatrix with missing value") {
    val nrow = 10
    val ncol = 5
    val data0 = new Array[Float](nrow * ncol)
    // put random nums
    for (i <- data0.indices) {
      if (i % 10 == 0) {
        data0(i) = -0.1f
      } else {
        data0(i) = Random.nextFloat()
      }
    }
    // create label
    val label0 = new Array[Float](nrow)
    for (i <- label0.indices) {
      label0(i) = Random.nextFloat()
    }
    val dmat0 = new DMatrix(data0, nrow, ncol, -0.1f)
    dmat0.setLabel(label0)
    // check
    assert(dmat0.rowNum === 10)
    assert(dmat0.getLabel.length === 10)
  }
}
@@ -38,7 +38,7 @@ def print_evaluation(period=1, show_stdv=True):
    """
    def callback(env):
        """internal function"""
        if env.rank != 0 or len(env.evaluation_result_list) == 0:
        if env.rank != 0 or len(env.evaluation_result_list) == 0 or period is False:
            return
        i = env.iteration
        if (i % period == 0 or i + 1 == env.begin_iteration):
@@ -703,7 +703,7 @@ class Dart : public GBTree {
        weight_drop[i] *= factor;
      }
      for (size_t i = 0; i < size_new_trees; ++i) {
        weight_drop.push_back(lr * factor);
        weight_drop.push_back(factor);
      }
    } else {
      // normalize_type 0
@@ -712,7 +712,7 @@ class Dart : public GBTree {
        weight_drop[i] *= factor;
      }
      for (size_t i = 0; i < size_new_trees; ++i) {
        weight_drop.push_back(1.0 * lr / (num_drop + lr));
        weight_drop.push_back(1.0 / (num_drop + lr));
      }
    }
  }