From 7cfeb5f0122e76de3581cdcb542dcf2b9202aae9 Mon Sep 17 00:00:00 2001
From: Yoshinori Nakano
Date: Fri, 10 Jun 2016 07:28:24 +0900
Subject: [PATCH 1/6] fix Dart::NormalizeTrees (#1265)

---
 doc/parameter.md  | 4 ++--
 src/gbm/gbtree.cc | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/parameter.md b/doc/parameter.md
index 70575343e..644de6076 100644
--- a/doc/parameter.md
+++ b/doc/parameter.md
@@ -84,10 +84,10 @@ Additional parameters for Dart Booster
 * normalize_type [default="tree]
   - type of normalization algorithm.
   - "tree": New trees have the same weight of each of dropped trees.
-    weight of new trees are learning_rate / (k + learnig_rate)
+    weight of new trees are 1 / (k + learnig_rate)
     dropped trees are scaled by a factor of k / (k + learning_rate)
   - "forest": New trees have the same weight of sum of dropped trees (forest).
-    weight of new trees are learning_rate / (1 + learning_rate)
+    weight of new trees are 1 / (1 + learning_rate)
     dropped trees are scaled by a factor of 1 / (1 + learning_rate)
 * rate_drop [default=0.0]
   - dropout rate.
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index a48fb2f94..4a6f0b8c6 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -703,7 +703,7 @@ class Dart : public GBTree {
           weight_drop[i] *= factor;
         }
         for (size_t i = 0; i < size_new_trees; ++i) {
-          weight_drop.push_back(lr * factor);
+          weight_drop.push_back(factor);
         }
       } else {
         // normalize_type 0
@@ -712,7 +712,7 @@ class Dart : public GBTree {
           weight_drop[i] *= factor;
         }
         for (size_t i = 0; i < size_new_trees; ++i) {
-          weight_drop.push_back(1.0 * lr / (num_drop + lr));
+          weight_drop.push_back(1.0 / (num_drop + lr));
         }
       }
     }

From aaf0a734861dd300429b335dad10442566cf3b2f Mon Sep 17 00:00:00 2001
From: Vladimir
Date: Sun, 12 Jun 2016 18:36:36 +0200
Subject: [PATCH 2/6] fixed error when eval False (#1271)

---
 python-package/xgboost/callback.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py
index 3683ea2dd..819b1aba4 100644
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -38,7 +38,7 @@ def print_evaluation(period=1, show_stdv=True):
     """
     def callback(env):
         """internal function"""
-        if env.rank != 0 or len(env.evaluation_result_list) == 0:
+        if env.rank != 0 or len(env.evaluation_result_list) == 0 or period is False:
             return
         i = env.iteration
         if (i % period == 0 or i + 1 == env.begin_iteration):

From 661c062bd927ddf9f99452d2dd61f98dbea9fe86 Mon Sep 17 00:00:00 2001
From: catena
Date: Sat, 5 Mar 2016 17:35:42 +0530
Subject: [PATCH 3/6] add best_ntreelimit attribute

---
 R-package/R/xgb.cv.R    | 6 +++---
 R-package/R/xgb.train.R | 9 +++++++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index e3faf33a0..c61cdbc5b 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -196,9 +196,9 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
       score <- as.numeric(score)
       if ( (maximize && score > bestScore) || (!maximize && score < bestScore)) {
         bestScore <- score
-        bestInd <- i
+        bestInd <- i - 1
       } else {
-        if (i - bestInd >= early.stop.round) {
+        if (i - bestInd > early.stop.round) {
           earlyStopflag <- TRUE
           cat('Stopping. Best iteration:', bestInd, '\n')
          break
@@ -211,7 +211,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     for (k in 1:nfold) {
       fd <- xgb_folds[[k]]
       if (!is.null(early.stop.round) && earlyStopflag) {
-        res <- xgb.iter.eval(fd$booster, fd$watchlist, bestInd - 1, feval, prediction)
+        res <- xgb.iter.eval(fd$booster, fd$watchlist, bestInd, feval, prediction)
       } else {
         res <- xgb.iter.eval(fd$booster, fd$watchlist, nrounds - 1, feval, prediction)
       }
diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index 92b5aea10..3868ddf89 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -208,10 +208,10 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
       score <- as.numeric(score)
       if ( (maximize && score > bestScore) || (!maximize && score < bestScore)) {
         bestScore <- score
-        bestInd <- i
+        bestInd <- i - 1
       } else {
         earlyStopflag = TRUE
-        if (i - bestInd >= early.stop.round) {
+        if (i - bestInd > early.stop.round) {
           cat('Stopping. Best iteration:', bestInd, '\n')
           break
         }
@@ -229,6 +229,11 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
   if (!is.null(early.stop.round)) {
     bst$bestScore <- bestScore
     bst$bestInd <- bestInd
+    if (!is.null(params$num_parallel_tree)) {
+      bst$best_ntreelimit <- (bst$bestInd + 1) * params$num_parallel_tree
+    } else {
+      bst$best_ntreelimit <- bst$bestInd + 1
+    }
   }
 
   attr(bst, "call") <- fit.call

From 465e5dfb876394b57a48a7a779a74809db133a2a Mon Sep 17 00:00:00 2001
From: Bill Chambers
Date: Mon, 13 Jun 2016 15:41:24 -0700
Subject: [PATCH 4/6] Broken Link in README (#1275)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8b9c10168..f8c53a9c3 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ What's New
 ----------
 * [XGBoost4J: Portable Distributed XGboost in Spark, Flink and Dataflow](http://dmlc.ml/2016/03/14/xgboost4j-portable-distributed-xgboost-in-spark-flink-and-dataflow.html), see [JVM-Package](https://github.com/dmlc/xgboost/tree/master/jvm-packages)
 * [Story and Lessons Behind the Evolution of XGBoost](http://homes.cs.washington.edu/~tqchen/2016/03/10/story-and-lessons-behind-the-evolution-of-xgboost.html)
-* [Tutorial: Distributed XGBoost on AWS with YARN](https://xgboost.readthedocs.org/en/latest/tutorial/aws_yarn.html)
+* [Tutorial: Distributed XGBoost on AWS with YARN](https://xgboost.readthedocs.io/en/latest/tutorials/aws_yarn.html)
 * [XGBoost brick](NEWS.md) Release
 
 Ask a Question

From c9a73fe2a99300aec3041371675a8fa6bc6a8a72 Mon Sep 17 00:00:00 2001
From: Nan Zhu
Date: Wed, 15 Jun 2016 16:03:37 -0400
Subject: [PATCH 5/6] explicitly throw exception when detecting empty partition in training dataset (#1281)

---
 .../dmlc/xgboost4j/scala/spark/XGBoost.scala | 45 +++++++++++--------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 5903bd2c9..a9e771ae7 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -16,20 +16,16 @@
 package ml.dmlc.xgboost4j.scala.spark
 
-import java.nio.file.Paths
-
-import scala.collection.mutable
 import scala.collection.JavaConverters._
+import scala.collection.mutable
 
-import org.apache.hadoop.fs.{Path, FileSystem}
-
+import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, Rabit, RabitTracker, XGBoostError}
+import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
 import org.apache.commons.logging.LogFactory
-import org.apache.spark.{SparkContext, TaskContext}
+import org.apache.hadoop.fs.Path
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
-
-import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, XGBoostError, Rabit, RabitTracker}
-import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
+import org.apache.spark.{SparkContext, TaskContext}
 
 object XGBoost extends Serializable {
   private val logger = LogFactory.getLog("XGBoostSpark")
 
@@ -58,22 +54,33 @@ object XGBoost extends Serializable {
       }
     }
     val appName = partitionedData.context.appName
+    // to workaround the empty partitions in training dataset,
+    // this might not be the best efficient implementation, see
+    // (https://github.com/dmlc/xgboost/issues/1277)
     partitionedData.mapPartitions { trainingSamples =>
       rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString)
       Rabit.init(rabitEnv.asJava)
-      val cacheFileName: String = {
-        if (useExternalMemory && trainingSamples.hasNext) {
-          s"$appName-dtrain_cache-${TaskContext.getPartitionId()}"
-        } else {
-          null
+      var booster: Booster = null
+      if (trainingSamples.hasNext) {
+        val cacheFileName: String = {
+          if (useExternalMemory && trainingSamples.hasNext) {
+            s"$appName-dtrain_cache-${TaskContext.getPartitionId()}"
+          } else {
+            null
+          }
         }
+        val trainingSet = new DMatrix(new JDMatrix(trainingSamples, cacheFileName))
+        booster = SXGBoost.train(trainingSet, xgBoostConfMap, round,
+          watches = new mutable.HashMap[String, DMatrix] {
+            put("train", trainingSet)
+          }.toMap, obj, eval)
+        Rabit.shutdown()
+      } else {
+        Rabit.shutdown()
+        throw new XGBoostError(s"detect the empty partition in training dataset, partition ID:" +
+          s" ${TaskContext.getPartitionId().toString}")
       }
-      val trainingSet = new DMatrix(new JDMatrix(trainingSamples, cacheFileName))
-      val booster = SXGBoost.train(trainingSet, xgBoostConfMap, round,
-        watches = new mutable.HashMap[String, DMatrix]{put("train", trainingSet)}.toMap,
-        obj, eval)
-      Rabit.shutdown()
       Iterator(booster)
     }.cache()
   }

From bd5b07873eace4d529b07b898627f508e3e93039 Mon Sep 17 00:00:00 2001
From: Nan Zhu
Date: Tue, 21 Jun 2016 17:35:17 -0400
Subject: [PATCH 6/6] [jvm-packages] create dmatrix with specified missing value (#1272)

* create dmatrix with specified missing value
* update dmlc-core
* support for predict method in spark package repartitioning work around
* add more elements to work around training set empty partition issue
---
 .../xgboost4j/scala/spark/XGBoostModel.scala | 28 +++++++++++++-
 .../xgboost4j/scala/spark/XGBoostSuite.scala | 37 ++++++++++++++++++-
 .../java/ml/dmlc/xgboost4j/java/DMatrix.java | 13 +++++++
 .../ml/dmlc/xgboost4j/scala/DMatrix.scala    | 13 +++++++
 .../ml/dmlc/xgboost4j/java/DMatrixTest.java  | 30 +++++++++++++++
 .../dmlc/xgboost4j/scala/DMatrixSuite.scala  | 24 ++++++++++++
 6 files changed, 143 insertions(+), 2 deletions(-)

diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala
index f81e63048..f97d7b17d 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala
@@ -18,7 +18,7 @@ package ml.dmlc.xgboost4j.scala.spark
 
 import org.apache.hadoop.fs.{Path, FileSystem}
 import org.apache.spark.{TaskContext, SparkContext}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.linalg.{DenseVector, Vector}
 import org.apache.spark.rdd.RDD
 import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
 import ml.dmlc.xgboost4j.scala.{DMatrix, Booster}
@@ -27,6 +27,7 @@ class XGBoostModel(_booster: Booster)(implicit val sc: SparkContext) extends Ser
   /**
    * Predict result with the given testset (represented as RDD)
+   *
    * @param testSet test set representd as RDD
    * @param useExternalCache whether to use external cache for the test set
    */
@@ -51,6 +52,31 @@ class XGBoostModel(_booster: Booster)(implicit val sc: SparkContext) extends Ser
     }
   }
 
+  /**
+   * Predict result with the given testset (represented as RDD)
+   * @param testSet test set representd as RDD
+   * @param missingValue the specified value to represent the missing value
+   */
+  def predict(testSet: RDD[DenseVector], missingValue: Float): RDD[Array[Array[Float]]] = {
+    val broadcastBooster = testSet.sparkContext.broadcast(_booster)
+    testSet.mapPartitions { testSamples =>
+      val sampleArray = testSamples.toList
+      val numRows = sampleArray.size
+      val numColumns = sampleArray.head.size
+      if (numRows == 0) {
+        Iterator()
+      } else {
+        // translate to required format
+        val flatSampleArray = new Array[Float](numRows * numColumns)
+        for (i <- flatSampleArray.indices) {
+          flatSampleArray(i) = sampleArray(i / numColumns).values(i % numColumns).toFloat
+        }
+        val dMatrix = new DMatrix(flatSampleArray, numRows, numColumns, missingValue)
+        Iterator(broadcastBooster.value.predict(dMatrix))
+      }
+    }
+  }
+
   /**
    * predict result given the test data (represented as DMatrix)
    */
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala
index 71bb9ecf8..d17c25b2e 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala
@@ -21,6 +21,7 @@
 import java.nio.file.Files
 
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
+import scala.util.Random
 
 import org.apache.commons.logging.LogFactory
 import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors, DenseVector}
@@ -208,7 +209,41 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
       "objective" -> "binary:logistic").toMap
     val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
-    println(xgBoostModel.predict(testRDD))
+    println(xgBoostModel.predict(testRDD).collect())
   }
+
+  test("test with dense vectors containing missing value") {
+    def buildDenseRDD(): RDD[LabeledPoint] = {
+      val nrow = 100
+      val ncol = 5
+      val data0 = Array.ofDim[Double](nrow, ncol)
+      // put random nums
+      for (r <- 0 until nrow; c <- 0 until ncol) {
+        data0(r)(c) = {
+          if (c == ncol - 1) {
+            -0.1
+          } else {
+            Random.nextDouble()
+          }
+        }
+      }
+      // create label
+      val label0 = new Array[Double](nrow)
+      for (i <- label0.indices) {
+        label0(i) = Random.nextDouble()
+      }
+      val points = new ListBuffer[LabeledPoint]
+      for (r <- 0 until nrow) {
+        points += LabeledPoint(label0(r), Vectors.dense(data0(r)))
+      }
+      sc.parallelize(points)
+    }
+    val trainingRDD = buildDenseRDD().repartition(4)
+    val testRDD = buildDenseRDD().repartition(4)
+    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
+      "objective" -> "binary:logistic").toMap
+    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, 4)
+    xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect()
+  }
 
   test("training with external memory cache") {
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java
index 29ee6cf72..8d4db66a1 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java
@@ -118,6 +118,19 @@ public class DMatrix {
     handle = out[0];
   }
 
+  /**
+   * create DMatrix from dense matrix
+   * @param data data values
+   * @param nrow number of rows
+   * @param ncol number of columns
+   * @param missing the specified value to represent the missing value
+   */
+  public DMatrix(float[] data, int nrow, int ncol, float missing) throws XGBoostError {
+    long[] out = new long[1];
+    JNIErrorHandle.checkCall(XGBoostJNI.XGDMatrixCreateFromMat(data, nrow, ncol, missing, out));
+    handle = out[0];
+  }
+
   /**
    * used for DMatrix slice
    */
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala
index 27bcc618c..bb21c3004 100644
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala
+++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala
@@ -67,6 +67,19 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
     this(new JDMatrix(data, nrow, ncol))
   }
 
+  /**
+   * create DMatrix from dense matrix
+   *
+   * @param data data values
+   * @param nrow number of rows
+   * @param ncol number of columns
+   * @param missing the specified value to represent the missing value
+   */
+  @throws(classOf[XGBoostError])
+  def this(data: Array[Float], nrow: Int, ncol: Int, missing: Float) {
+    this(new JDMatrix(data, nrow, ncol, missing))
+  }
+
   /**
    * set label of dmatrix
   *
diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
index d9004b418..0889f5777 100644
--- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
+++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
@@ -125,4 +125,34 @@ public class DMatrixTest {
     TestCase.assertTrue(Arrays.equals(weights, dmat0.getWeight()));
   }
+
+  @Test
+  public void testCreateFromDenseMatrixWithMissingValue() throws XGBoostError {
+    //create DMatrix from 10*5 dense matrix
+    int nrow = 10;
+    int ncol = 5;
+    float[] data0 = new float[nrow * ncol];
+    //put random nums
+    Random random = new Random();
+    for (int i = 0; i < nrow * ncol; i++) {
+      if (i % 10 == 0) {
+        data0[i] = -0.1f;
+      } else {
+        data0[i] = random.nextFloat();
+      }
+    }
+
+    //create label
+    float[] label0 = new float[nrow];
+    for (int i = 0; i < nrow; i++) {
+      label0[i] = random.nextFloat();
+    }
+
+    DMatrix dmat0 = new DMatrix(data0, nrow, ncol, -0.1f);
+    dmat0.setLabel(label0);
+
+    //check
+    TestCase.assertTrue(dmat0.rowNum() == 10);
+    TestCase.assertTrue(dmat0.getLabel().length == 10);
+  }
 }
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
index e6646b95b..0080b1dd4 100644
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
+++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
@@ -82,4 +82,28 @@ class DMatrixSuite extends FunSuite {
     dmat0.setWeight(weights)
     assert(weights === dmat0.getWeight)
   }
+
+  test("create DMatrix from DenseMatrix with missing value") {
+    val nrow = 10
+    val ncol = 5
+    val data0 = new Array[Float](nrow * ncol)
+    // put random nums
+    for (i <- data0.indices) {
+      if (i % 10 == 0) {
+        data0(i) = -0.1f
+      } else {
+        data0(i) = Random.nextFloat()
+      }
+    }
+    // create label
+    val label0 = new Array[Float](nrow)
+    for (i <- label0.indices) {
+      label0(i) = Random.nextFloat()
+    }
+    val dmat0 = new DMatrix(data0, nrow, ncol, -0.1f)
+    dmat0.setLabel(label0)
+    // check
+    assert(dmat0.rowNum === 10)
+    assert(dmat0.getLabel.length === 10)
+  }
 }
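
A note on the DART normalization that PATCH 1/6 corrects: the doc/parameter.md hunk gives the fixed weights, and the bookkeeping is small enough to read on its own. Below is a minimal standalone C++ sketch of that logic, assuming illustrative names (weight_drop, lr, forest_mode) rather than the exact identifiers of Dart::NormalizeTrees in src/gbm/gbtree.cc; it mirrors the documented formulas only and, for brevity, does not handle the k = 0 "nothing was dropped" case.

#include <cstddef>
#include <vector>

// Sketch of the corrected DART weight normalization (illustrative names).
//  - weight_drop: weights of the k dropped trees; new-tree weights are appended
//  - lr: learning rate (eta)
//  - size_new_trees: number of trees grown in this round
//  - forest_mode: true for normalize_type "forest", false for "tree"
void NormalizeTrees(std::vector<float>* weight_drop, float lr,
                    std::size_t size_new_trees, bool forest_mode) {
  const std::size_t num_drop = weight_drop->size();
  if (forest_mode) {
    // "forest": new trees take the weight of the whole dropped forest,
    // and both dropped and new trees are scaled by 1 / (1 + learning_rate).
    const float factor = 1.0f / (1.0f + lr);
    for (std::size_t i = 0; i < num_drop; ++i) {
      (*weight_drop)[i] *= factor;
    }
    for (std::size_t i = 0; i < size_new_trees; ++i) {
      weight_drop->push_back(factor);
    }
  } else {
    // "tree": each new tree takes the weight of a single dropped tree;
    // dropped trees are scaled by k / (k + learning_rate) and each new
    // tree gets 1 / (k + learning_rate).
    const float factor = static_cast<float>(num_drop) / (num_drop + lr);
    for (std::size_t i = 0; i < num_drop; ++i) {
      (*weight_drop)[i] *= factor;
    }
    for (std::size_t i = 0; i < size_new_trees; ++i) {
      weight_drop->push_back(1.0f / (num_drop + lr));
    }
  }
}

As a quick check, with k = 2 dropped trees of weight 1.0 and learning_rate = 0.1, the "tree" rule rescales each dropped tree to 2 / 2.1 (about 0.952) and appends new trees with weight 1 / 2.1 (about 0.476), which is exactly the corrected formula quoted in the parameter.md hunk.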