diff --git a/README.md b/README.md
index 8b9c10168..f8c53a9c3 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ What's New
 ----------
 * [XGBoost4J: Portable Distributed XGboost in Spark, Flink and Dataflow](http://dmlc.ml/2016/03/14/xgboost4j-portable-distributed-xgboost-in-spark-flink-and-dataflow.html), see [JVM-Package](https://github.com/dmlc/xgboost/tree/master/jvm-packages)
 * [Story and Lessons Behind the Evolution of XGBoost](http://homes.cs.washington.edu/~tqchen/2016/03/10/story-and-lessons-behind-the-evolution-of-xgboost.html)
-* [Tutorial: Distributed XGBoost on AWS with YARN](https://xgboost.readthedocs.org/en/latest/tutorial/aws_yarn.html)
+* [Tutorial: Distributed XGBoost on AWS with YARN](https://xgboost.readthedocs.io/en/latest/tutorials/aws_yarn.html)
 * [XGBoost brick](NEWS.md) Release
 
 Ask a Question
diff --git a/doc/parameter.md b/doc/parameter.md
index 70575343e..644de6076 100644
--- a/doc/parameter.md
+++ b/doc/parameter.md
@@ -84,10 +84,10 @@ Additional parameters for Dart Booster
 * normalize_type [default="tree"]
   - type of normalization algorithm.
   - "tree": New trees have the same weight as each of the dropped trees.
-    weight of new trees are learning_rate / (k + learnig_rate)
+    weight of new trees is 1 / (k + learning_rate)
     dropped trees are scaled by a factor of k / (k + learning_rate)
   - "forest": New trees have the same weight as the sum of the dropped trees (forest).
-    weight of new trees are learning_rate / (1 + learning_rate)
+    weight of new trees is 1 / (1 + learning_rate)
     dropped trees are scaled by a factor of 1 / (1 + learning_rate)
 * rate_drop [default=0.0]
   - dropout rate.
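
For concreteness, the corrected weights above can be computed directly. The sketch below is illustrative only and not part of this patch; `learningRate` and `numDropped` are hypothetical example values.

```scala
// Illustrative sketch of the corrected DART normalization weights.
object DartWeights extends App {
  val learningRate = 0.1 // eta; hypothetical example value
  val numDropped = 3     // k, the number of dropped trees

  // normalize_type = "tree": the new tree is weighted like each dropped tree
  val newTree = 1.0 / (numDropped + learningRate)          // ~0.3226
  val treeScale = numDropped / (numDropped + learningRate) // ~0.9677

  // normalize_type = "forest": the new tree is weighted like the whole forest
  val newForest = 1.0 / (1 + learningRate)                 // ~0.9091
  val forestScale = 1.0 / (1 + learningRate)               // ~0.9091

  println(f"tree:   new tree weight $newTree%.4f, dropped trees scaled by $treeScale%.4f")
  println(f"forest: new tree weight $newForest%.4f, dropped trees scaled by $forestScale%.4f")
}
```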
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 5903bd2c9..a9e771ae7 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -16,20 +16,16 @@
 package ml.dmlc.xgboost4j.scala.spark
 
-import java.nio.file.Paths
-
-import scala.collection.mutable
 import scala.collection.JavaConverters._
+import scala.collection.mutable
 
-import org.apache.hadoop.fs.{Path, FileSystem}
-
+import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, Rabit, RabitTracker, XGBoostError}
+import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
 import org.apache.commons.logging.LogFactory
-import org.apache.spark.{SparkContext, TaskContext}
+import org.apache.hadoop.fs.Path
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
-
-import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, XGBoostError, Rabit, RabitTracker}
-import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
+import org.apache.spark.{SparkContext, TaskContext}
 
 object XGBoost extends Serializable {
   private val logger = LogFactory.getLog("XGBoostSpark")
@@ -58,22 +54,33 @@ object XGBoost extends Serializable {
       }
     }
     val appName = partitionedData.context.appName
+    // to work around empty partitions in the training dataset;
+    // this might not be the most efficient implementation, see
+    // https://github.com/dmlc/xgboost/issues/1277
     partitionedData.mapPartitions { trainingSamples =>
       rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString)
       Rabit.init(rabitEnv.asJava)
-      val cacheFileName: String = {
-        if (useExternalMemory && trainingSamples.hasNext) {
-          s"$appName-dtrain_cache-${TaskContext.getPartitionId()}"
-        } else {
-          null
+      var booster: Booster = null
+      if (trainingSamples.hasNext) {
+        val cacheFileName: String = {
+          if (useExternalMemory && trainingSamples.hasNext) {
+            s"$appName-dtrain_cache-${TaskContext.getPartitionId()}"
+          } else {
+            null
+          }
         }
+        val trainingSet = new DMatrix(new JDMatrix(trainingSamples, cacheFileName))
+        booster = SXGBoost.train(trainingSet, xgBoostConfMap, round,
+          watches = new mutable.HashMap[String, DMatrix] {
+            put("train", trainingSet)
+          }.toMap, obj, eval)
+        Rabit.shutdown()
+      } else {
+        Rabit.shutdown()
+        throw new XGBoostError("detected an empty partition in the training dataset," +
+          s" partition ID: ${TaskContext.getPartitionId().toString}")
       }
-      val trainingSet = new DMatrix(new JDMatrix(trainingSamples, cacheFileName))
-      val booster = SXGBoost.train(trainingSet, xgBoostConfMap, round,
-        watches = new mutable.HashMap[String, DMatrix]{put("train", trainingSet)}.toMap,
-        obj, eval)
-      Rabit.shutdown()
       Iterator(booster)
     }.cache()
   }
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala
index f81e63048..f97d7b17d 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala
@@ -18,7 +18,7 @@ package ml.dmlc.xgboost4j.scala.spark
 
 import org.apache.hadoop.fs.{Path, FileSystem}
 import org.apache.spark.{TaskContext, SparkContext}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.linalg.{DenseVector, Vector}
 import org.apache.spark.rdd.RDD
 import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
 import ml.dmlc.xgboost4j.scala.{DMatrix, Booster}
@@ -27,6 +27,7 @@ class XGBoostModel(_booster: Booster)(implicit val sc: SparkContext) extends Ser
 
   /**
    * Predict result with the given testset (represented as RDD)
+   *
    * @param testSet test set represented as RDD
    * @param useExternalCache whether to use external cache for the test set
    */
@@ -51,6 +52,31 @@ class XGBoostModel(_booster: Booster)(implicit val sc: SparkContext) extends Ser
     }
   }
 
+  /**
+   * Predict result with the given testset (represented as RDD)
+   *
+   * @param testSet test set represented as RDD
+   * @param missingValue the specified value to represent a missing value
+   */
+  def predict(testSet: RDD[DenseVector], missingValue: Float): RDD[Array[Array[Float]]] = {
+    val broadcastBooster = testSet.sparkContext.broadcast(_booster)
+    testSet.mapPartitions { testSamples =>
+      val sampleArray = testSamples.toList
+      val numRows = sampleArray.size
+      if (numRows == 0) {
+        Iterator()
+      } else {
+        val numColumns = sampleArray.head.size
+        // translate to the row-major flat array format expected by DMatrix
+        val flatSampleArray = new Array[Float](numRows * numColumns)
+        for (i <- flatSampleArray.indices) {
+          flatSampleArray(i) = sampleArray(i / numColumns).values(i % numColumns).toFloat
+        }
+        val dMatrix = new DMatrix(flatSampleArray, numRows, numColumns, missingValue)
+        Iterator(broadcastBooster.value.predict(dMatrix))
+      }
+    }
+  }
+
   /**
    * predict result given the test data (represented as DMatrix)
    */
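
A minimal usage sketch for the new dense-vector `predict` overload, illustrative only: `sc` is assumed to be a live SparkContext and `model` a trained XGBoostModel produced by `XGBoost.train(...)`; the feature values are made up.

```scala
import org.apache.spark.mllib.linalg.Vectors

// Two dense rows in which -0.1f stands for a missing feature value.
val testSet = sc.parallelize(Seq(
  Vectors.dense(0.2, 0.7, -0.1),
  Vectors.dense(0.5, -0.1, 0.9)
).map(_.toDense))

// Each non-empty partition yields one Array[Array[Float]] batch of row predictions.
val predictions = model.predict(testSet, missingValue = -0.1f)
predictions.collect().foreach(_.foreach(row => println(row.mkString(","))))
```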
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala
index 71bb9ecf8..d17c25b2e 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala
@@ -21,6 +21,7 @@ import java.nio.file.Files
 
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
+import scala.util.Random
 
 import org.apache.commons.logging.LogFactory
 import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors, DenseVector}
@@ -208,7 +209,41 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
       "objective" -> "binary:logistic").toMap
     val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
-    println(xgBoostModel.predict(testRDD))
+    println(xgBoostModel.predict(testRDD).collect())
   }
 
+  test("test with dense vectors containing missing value") {
+    def buildDenseRDD(): RDD[LabeledPoint] = {
+      val nrow = 100
+      val ncol = 5
+      val data0 = Array.ofDim[Double](nrow, ncol)
+      // fill with random values; the last column always holds the missing value
+      for (r <- 0 until nrow; c <- 0 until ncol) {
+        data0(r)(c) = {
+          if (c == ncol - 1) {
+            -0.1
+          } else {
+            Random.nextDouble()
+          }
+        }
+      }
+      // create random labels
+      val label0 = new Array[Double](nrow)
+      for (i <- label0.indices) {
+        label0(i) = Random.nextDouble()
+      }
+      val points = new ListBuffer[LabeledPoint]
+      for (r <- 0 until nrow) {
+        points += LabeledPoint(label0(r), Vectors.dense(data0(r)))
+      }
+      sc.parallelize(points)
+    }
+    val trainingRDD = buildDenseRDD().repartition(4)
+    val testRDD = buildDenseRDD().repartition(4)
+    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
+      "objective" -> "binary:logistic").toMap
+    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, 4)
+    xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect()
+  }
+
   test("training with external memory cache") {
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java
index 29ee6cf72..8d4db66a1 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java
@@ -118,6 +118,19 @@ public class DMatrix {
     handle = out[0];
   }
 
+  /**
+   * create DMatrix from a dense matrix
+   * @param data data values
+   * @param nrow number of rows
+   * @param ncol number of columns
+   * @param missing the specified value to represent a missing value
+   */
+  public DMatrix(float[] data, int nrow, int ncol, float missing) throws XGBoostError {
+    long[] out = new long[1];
+    JNIErrorHandle.checkCall(XGBoostJNI.XGDMatrixCreateFromMat(data, nrow, ncol, missing, out));
+    handle = out[0];
+  }
+
   /**
    * used for DMatrix slice
    */
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala
index 27bcc618c..bb21c3004 100644
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala
+++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala
@@ -67,6 +67,19 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
     this(new JDMatrix(data, nrow, ncol))
   }
 
+  /**
+   * create DMatrix from a dense matrix
+   *
+   * @param data data values
+   * @param nrow number of rows
+   * @param ncol number of columns
+   * @param missing the specified value to represent a missing value
+   */
+  @throws(classOf[XGBoostError])
+  def this(data: Array[Float], nrow: Int, ncol: Int, missing: Float) {
+    this(new JDMatrix(data, nrow, ncol, missing))
+  }
+
   /**
    * set label of dmatrix
    *
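
The new constructor can be exercised directly. A small illustrative sketch with made-up values; the matrix is laid out row-major:

```scala
import ml.dmlc.xgboost4j.scala.DMatrix

// A 2 x 3 dense matrix in row-major order; -0.1f is interpreted as a
// missing value rather than as a real feature value.
val data = Array(1.0f, 2.0f, -0.1f, // row 0: third feature missing
                 -0.1f, 5.0f, 6.0f) // row 1: first feature missing
val dmat = new DMatrix(data, 2, 3, -0.1f)
dmat.setLabel(Array(0.0f, 1.0f))
```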
a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
index d9004b418..0889f5777 100644
--- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
+++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
@@ -125,4 +125,34 @@ public class DMatrixTest {
 
     TestCase.assertTrue(Arrays.equals(weights, dmat0.getWeight()));
   }
+
+  @Test
+  public void testCreateFromDenseMatrixWithMissingValue() throws XGBoostError {
+    // create a DMatrix from a 10 * 5 dense matrix
+    int nrow = 10;
+    int ncol = 5;
+    float[] data0 = new float[nrow * ncol];
+    // fill with random values; mark every tenth entry as missing
+    Random random = new Random();
+    for (int i = 0; i < nrow * ncol; i++) {
+      if (i % 10 == 0) {
+        data0[i] = -0.1f;
+      } else {
+        data0[i] = random.nextFloat();
+      }
+    }
+
+    // create random labels
+    float[] label0 = new float[nrow];
+    for (int i = 0; i < nrow; i++) {
+      label0[i] = random.nextFloat();
+    }
+
+    DMatrix dmat0 = new DMatrix(data0, nrow, ncol, -0.1f);
+    dmat0.setLabel(label0);
+
+    // check
+    TestCase.assertTrue(dmat0.rowNum() == 10);
+    TestCase.assertTrue(dmat0.getLabel().length == 10);
+  }
 }
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
index e6646b95b..0080b1dd4 100644
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
+++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
@@ -82,4 +82,28 @@ class DMatrixSuite extends FunSuite {
     dmat0.setWeight(weights)
     assert(weights === dmat0.getWeight)
   }
+
+  test("create DMatrix from DenseMatrix with missing value") {
+    val nrow = 10
+    val ncol = 5
+    val data0 = new Array[Float](nrow * ncol)
+    // fill with random values; mark every tenth entry as missing
+    for (i <- data0.indices) {
+      if (i % 10 == 0) {
+        data0(i) = -0.1f
+      } else {
+        data0(i) = Random.nextFloat()
+      }
+    }
+    // create random labels
+    val label0 = new Array[Float](nrow)
+    for (i <- label0.indices) {
+      label0(i) = Random.nextFloat()
+    }
+    val dmat0 = new DMatrix(data0, nrow, ncol, -0.1f)
+    dmat0.setLabel(label0)
+    // check
+    assert(dmat0.rowNum === 10)
+    assert(dmat0.getLabel.length === 10)
+  }
 }
diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py
index 3683ea2dd..819b1aba4 100644
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -38,7 +38,7 @@ def print_evaluation(period=1, show_stdv=True):
     """
     def callback(env):
         """internal function"""
-        if env.rank != 0 or len(env.evaluation_result_list) == 0:
+        if env.rank != 0 or len(env.evaluation_result_list) == 0 or period is False:
             return
         i = env.iteration
         if (i % period == 0 or i + 1 == env.begin_iteration):
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index a48fb2f94..4a6f0b8c6 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -703,7 +703,7 @@ class Dart : public GBTree {
           weight_drop[i] *= factor;
         }
         for (size_t i = 0; i < size_new_trees; ++i) {
-          weight_drop.push_back(lr * factor);
+          weight_drop.push_back(factor);
         }
       } else {
         // normalize_type 0
@@ -712,7 +712,7 @@
           weight_drop[i] *= factor;
         }
         for (size_t i = 0; i < size_new_trees; ++i) {
-          weight_drop.push_back(1.0 * lr / (num_drop + lr));
+          weight_drop.push_back(1.0 / (num_drop + lr));
         }
       }
     }
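
Why dropping the trailing `lr` factor is the right fix: under "tree" normalization, if each of the k dropped trees is assumed to carry weight 1/k, the rescaled dropped trees and the new tree should end up with identical weights, matching the parameter documentation above. A quick back-of-the-envelope check with illustrative values, not library code:

```scala
// Illustrative check of the corrected "tree" normalization with eta = 0.1
// and k = 3 dropped trees, each assumed to carry weight 1.0 / k.
val eta = 0.1
val k = 3
val droppedWeight = 1.0 / k                     // weight of each dropped tree
val rescaled = droppedWeight * k / (k + eta)    // after the k / (k + eta) scaling
val newTreeOld = eta / (k + eta)                // removed line: ~0.0323
val newTreeNew = 1.0 / (k + eta)                // fixed line:   ~0.3226
assert(math.abs(newTreeNew - rescaled) < 1e-12) // matches the rescaled dropped trees
// The old expression under-weighted the new tree by a factor of eta.
```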