From 5efc1ee3a47062fd5f2924fa023b46094bb37049 Mon Sep 17 00:00:00 2001 From: Andrew Smith Date: Tue, 22 Mar 2016 12:54:18 +0000 Subject: [PATCH 1/3] Fixed typos. --- R-package/demo/basic_walkthrough.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R index 193618be3..ece168a04 100644 --- a/R-package/demo/basic_walkthrough.R +++ b/R-package/demo/basic_walkthrough.R @@ -64,8 +64,8 @@ raw = xgb.save.raw(bst) # load binary model to R bst3 <- xgb.load(raw) pred3 <- predict(bst3, test$data) -# pred2 should be identical to pred -print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred)))) +# pred3 should be identical to pred +print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred)))) #----------------Advanced features -------------- # to use advanced features, we need to put data in xgb.DMatrix From bbb9ce1641a92c13948754a7949751b05ebfa068 Mon Sep 17 00:00:00 2001 From: Julian Quick Date: Tue, 22 Mar 2016 14:13:29 -0600 Subject: [PATCH 2/3] Verbose message: which fields have improper data types A more verbose error message letting the user know which fields have improper data types --- python-package/xgboost/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 971d0a95b..80d171326 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -144,7 +144,8 @@ def _maybe_pandas_data(data, feature_names, feature_types): data_dtypes = data.dtypes if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes): - raise ValueError('DataFrame.dtypes for data must be int, float or bool') + bad_fields = [data.columns[i] for i, dtype in enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER ] + raise ValueError('DataFrame.dtypes for data must be int, float or bool.\nDid not expect the data types in fie lds '+', '.join(bad_fields)) if feature_names is None: 
feature_names = data.columns.format() From d8535313eb96c65a4c29708dbf02aa71cb8bb7f9 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 23 Mar 2016 12:30:06 -0400 Subject: [PATCH 3/3] allow empty partitions --- .../xgboost4j/scala/spark/XGBoostModel.scala | 8 +++++-- .../xgboost4j/scala/spark/XGBoostSuite.scala | 22 ++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala index 14fa3c0f6..75a91e64c 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostModel.scala @@ -32,8 +32,12 @@ class XGBoostModel(_booster: Booster)(implicit val sc: SparkContext) extends Ser import DataUtils._ val broadcastBooster = testSet.sparkContext.broadcast(_booster) testSet.mapPartitions { testSamples => - val dMatrix = new DMatrix(new JDMatrix(testSamples, null)) - Iterator(broadcastBooster.value.predict(dMatrix)) + if (testSamples.hasNext) { + val dMatrix = new DMatrix(new JDMatrix(testSamples, null)) + Iterator(broadcastBooster.value.predict(dMatrix)) + } else { + Iterator() + } } } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala index 6f4e98aa3..711ea35f0 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.ListBuffer import scala.io.Source import org.apache.commons.logging.LogFactory -import org.apache.spark.mllib.linalg.DenseVector +import org.apache.spark.mllib.linalg.{Vector => 
SparkVector, Vectors, DenseVector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} @@ -190,4 +190,24 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter { assert(eval.eval(xgBoostModel.predict(testSetDMatrix), testSetDMatrix) < 0.1) customSparkContext.stop() } + + test("test with empty partition") { + + def buildEmptyRDD(sparkContext: Option[SparkContext] = None): RDD[SparkVector] = { + val sampleList = new ListBuffer[SparkVector] + sparkContext.getOrElse(sc).parallelize(sampleList, numWorkers) + } + + val eval = new EvalError() + val trainingRDD = buildTrainingRDD() + val testRDD = buildEmptyRDD() + import DataUtils._ + val tempDir = Files.createTempDirectory("xgboosttest-") + val tempFile = Files.createTempFile(tempDir, "", "") + val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0", + "objective" -> "binary:logistic").toMap + val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers) + + println(xgBoostModel.predict(testRDD)) + } }