fix merge conflicts
commit 4b2eedc186
README.md
@@ -22,7 +22,7 @@ What's New
 * [XGBoost4J: Portable Distributed XGboost in Spark, Flink and Dataflow](http://dmlc.ml/2016/03/14/xgboost4j-portable-distributed-xgboost-in-spark-flink-and-dataflow.html), see [JVM-Package](https://github.com/dmlc/xgboost/tree/master/jvm-packages)
 * [Story and Lessons Behind the Evolution of XGBoost](http://homes.cs.washington.edu/~tqchen/2016/03/10/story-and-lessons-behind-the-evolution-of-xgboost.html)
-* [Tutorial: Distributed XGBoost on AWS with YARN](https://xgboost.readthedocs.org/en/latest/tutorial/aws_yarn.html)
+* [Tutorial: Distributed XGBoost on AWS with YARN](https://xgboost.readthedocs.io/en/latest/tutorials/aws_yarn.html)
 * [XGBoost brick](NEWS.md) Release
 
 Ask a Question
doc/parameter.md
@@ -84,10 +84,10 @@ Additional parameters for Dart Booster
 * normalize_type [default="tree"]
   - type of normalization algorithm.
   - "tree": New trees have the same weight as each of the dropped trees.
-    weight of new trees is learning_rate / (k + learning_rate)
+    weight of new trees is 1 / (k + learning_rate)
     dropped trees are scaled by a factor of k / (k + learning_rate)
   - "forest": New trees have the same weight as the sum of the dropped trees (forest).
-    weight of new trees is learning_rate / (1 + learning_rate)
+    weight of new trees is 1 / (1 + learning_rate)
     dropped trees are scaled by a factor of 1 / (1 + learning_rate)
 * rate_drop [default=0.0]
   - dropout rate.
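The two corrected formulas are easy to misread in prose, so here is a minimal sketch of the updated rules. It is illustrative only, not part of xgboost's API; `dartWeights`, `k`, and `eta` are made-up names (k = number of dropped trees, eta = learning_rate):

    // Returns (weight of the new tree, scale applied to each dropped tree),
    // per the corrected DART documentation above.
    def dartWeights(k: Int, eta: Double, normalizeType: String): (Double, Double) =
      normalizeType match {
        // "tree": the new tree is weighted like one dropped tree
        case "tree"   => (1.0 / (k + eta), k.toDouble / (k + eta))
        // "forest": the new tree is weighted like the whole dropped forest
        case "forest" => (1.0 / (1 + eta), 1.0 / (1 + eta))
        case other    => throw new IllegalArgumentException(s"unknown type: $other")
      }

    // Example: k = 3 dropped trees, eta = 0.1 under "tree" normalization gives
    // new-tree weight 1 / 3.1 ≈ 0.323 and dropped-tree scale 3 / 3.1 ≈ 0.968.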
XGBoost.scala
@@ -16,20 +16,16 @@
 
 package ml.dmlc.xgboost4j.scala.spark
 
-import java.nio.file.Paths
-
-import scala.collection.mutable
 import scala.collection.JavaConverters._
+import scala.collection.mutable
 
-import org.apache.hadoop.fs.{Path, FileSystem}
+import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, Rabit, RabitTracker, XGBoostError}
+import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
 import org.apache.commons.logging.LogFactory
-import org.apache.spark.{SparkContext, TaskContext}
+import org.apache.hadoop.fs.Path
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
-import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, XGBoostError, Rabit, RabitTracker}
-import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
+import org.apache.spark.{SparkContext, TaskContext}
 
 object XGBoost extends Serializable {
   private val logger = LogFactory.getLog("XGBoostSpark")
@@ -58,22 +54,33 @@ object XGBoost extends Serializable {
       }
     }
     val appName = partitionedData.context.appName
+    // to work around empty partitions in the training dataset;
+    // this might not be the most efficient implementation, see
+    // (https://github.com/dmlc/xgboost/issues/1277)
     partitionedData.mapPartitions {
       trainingSamples =>
        rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString)
        Rabit.init(rabitEnv.asJava)
-       val cacheFileName: String = {
-         if (useExternalMemory && trainingSamples.hasNext) {
-           s"$appName-dtrain_cache-${TaskContext.getPartitionId()}"
-         } else {
-           null
+       var booster: Booster = null
+       if (trainingSamples.hasNext) {
+         val cacheFileName: String = {
+           if (useExternalMemory && trainingSamples.hasNext) {
+             s"$appName-dtrain_cache-${TaskContext.getPartitionId()}"
+           } else {
+             null
+           }
          }
+         val trainingSet = new DMatrix(new JDMatrix(trainingSamples, cacheFileName))
+         booster = SXGBoost.train(trainingSet, xgBoostConfMap, round,
+           watches = new mutable.HashMap[String, DMatrix] {
+             put("train", trainingSet)
+           }.toMap, obj, eval)
+         Rabit.shutdown()
+       } else {
+         Rabit.shutdown()
+         throw new XGBoostError(s"detect the empty partition in training dataset, partition ID:" +
+           s" ${TaskContext.getPartitionId().toString}")
        }
-       val trainingSet = new DMatrix(new JDMatrix(trainingSamples, cacheFileName))
-       val booster = SXGBoost.train(trainingSet, xgBoostConfMap, round,
-         watches = new mutable.HashMap[String, DMatrix]{put("train", trainingSet)}.toMap,
-         obj, eval)
-       Rabit.shutdown()
        Iterator(booster)
     }.cache()
   }
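With this change, training fails fast with an XGBoostError on an empty partition instead of hanging in Rabit. One way a caller can avoid hitting that path is to cap the partition count at the row count before training; the sketch below is a hypothetical helper, not part of this commit (`withoutEmptyPartitions` is an invented name):

    import org.apache.spark.rdd.RDD

    // Hypothetical caller-side guard: never ask for more partitions than rows,
    // so no training partition can end up empty.
    def withoutEmptyPartitions[T](data: RDD[T], numWorkers: Int): RDD[T] = {
      val rows = data.count()
      data.repartition(math.min(numWorkers.toLong, rows).toInt)
    }

Spark's `repartition` spreads rows roughly evenly, so with at most one partition per row every partition is non-empty in practice.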
XGBoostModel.scala
@@ -18,7 +18,7 @@ package ml.dmlc.xgboost4j.scala.spark
 
 import org.apache.hadoop.fs.{Path, FileSystem}
 import org.apache.spark.{TaskContext, SparkContext}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.linalg.{DenseVector, Vector}
 import org.apache.spark.rdd.RDD
 import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
 import ml.dmlc.xgboost4j.scala.{DMatrix, Booster}
@@ -27,6 +27,7 @@ class XGBoostModel(_booster: Booster)(implicit val sc: SparkContext) extends Ser
 
   /**
    * Predict result with the given testset (represented as RDD)
+   *
    * @param testSet test set represented as RDD
    * @param useExternalCache whether to use external cache for the test set
    */
@@ -51,6 +52,31 @@ class XGBoostModel(_booster: Booster)(implicit val sc: SparkContext) extends Ser
     }
   }
 
+  /**
+   * Predict result with the given testset (represented as RDD)
+   * @param testSet test set represented as RDD
+   * @param missingValue the specified value to represent the missing value
+   */
+  def predict(testSet: RDD[DenseVector], missingValue: Float): RDD[Array[Array[Float]]] = {
+    val broadcastBooster = testSet.sparkContext.broadcast(_booster)
+    testSet.mapPartitions { testSamples =>
+      val sampleArray = testSamples.toList
+      val numRows = sampleArray.size
+      val numColumns = sampleArray.head.size
+      if (numRows == 0) {
+        Iterator()
+      } else {
+        // translate to required format
+        val flatSampleArray = new Array[Float](numRows * numColumns)
+        for (i <- flatSampleArray.indices) {
+          flatSampleArray(i) = sampleArray(i / numColumns).values(i % numColumns).toFloat
+        }
+        val dMatrix = new DMatrix(flatSampleArray, numRows, numColumns, missingValue)
+        Iterator(broadcastBooster.value.predict(dMatrix))
+      }
+    }
+  }
+
   /**
    * predict result given the test data (represented as DMatrix)
    */
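For reference, a minimal usage sketch of the new overload. The trained `model`, the SparkContext `sc`, and the `-0.1f` sentinel are assumptions for illustration, not from this commit:

    import org.apache.spark.mllib.linalg.{DenseVector, Vectors}
    import org.apache.spark.rdd.RDD

    // Two dense rows where -0.1 marks a missing feature value.
    val testSet: RDD[DenseVector] = sc.parallelize(Seq(
      Vectors.dense(0.2, 0.7, -0.1).toDense,
      Vectors.dense(0.5, -0.1, 0.9).toDense))

    // One Array[Array[Float]] per partition; inner arrays hold per-row predictions.
    val predictions: RDD[Array[Array[Float]]] = model.predict(testSet, missingValue = -0.1f)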
XGBoostSuite.scala
@@ -21,6 +21,7 @@ import java.nio.file.Files
 
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
+import scala.util.Random
 
 import org.apache.commons.logging.LogFactory
 import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors, DenseVector}
@@ -208,7 +209,41 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
       "objective" -> "binary:logistic").toMap
     val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
 
-    println(xgBoostModel.predict(testRDD))
+    println(xgBoostModel.predict(testRDD).collect())
+  }
+
+  test("test with dense vectors containing missing value") {
+    def buildDenseRDD(): RDD[LabeledPoint] = {
+      val nrow = 100
+      val ncol = 5
+      val data0 = Array.ofDim[Double](nrow, ncol)
+      // put random nums
+      for (r <- 0 until nrow; c <- 0 until ncol) {
+        data0(r)(c) = {
+          if (c == ncol - 1) {
+            -0.1
+          } else {
+            Random.nextDouble()
+          }
+        }
+      }
+      // create label
+      val label0 = new Array[Double](nrow)
+      for (i <- label0.indices) {
+        label0(i) = Random.nextDouble()
+      }
+      val points = new ListBuffer[LabeledPoint]
+      for (r <- 0 until nrow) {
+        points += LabeledPoint(label0(r), Vectors.dense(data0(r)))
+      }
+      sc.parallelize(points)
+    }
+    val trainingRDD = buildDenseRDD().repartition(4)
+    val testRDD = buildDenseRDD().repartition(4)
+    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
+      "objective" -> "binary:logistic").toMap
+    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, 4)
+    xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect()
   }
 
   test("training with external memory cache") {
DMatrix.java
@@ -118,6 +118,19 @@ public class DMatrix {
     handle = out[0];
   }
 
+  /**
+   * create DMatrix from dense matrix
+   * @param data data values
+   * @param nrow number of rows
+   * @param ncol number of columns
+   * @param missing the specified value to represent the missing value
+   */
+  public DMatrix(float[] data, int nrow, int ncol, float missing) throws XGBoostError {
+    long[] out = new long[1];
+    JNIErrorHandle.checkCall(XGBoostJNI.XGDMatrixCreateFromMat(data, nrow, ncol, missing, out));
+    handle = out[0];
+  }
+
   /**
    * used for DMatrix slice
    */
DMatrix.scala
@@ -67,6 +67,19 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
     this(new JDMatrix(data, nrow, ncol))
   }
 
+  /**
+   * create DMatrix from dense matrix
+   *
+   * @param data data values
+   * @param nrow number of rows
+   * @param ncol number of columns
+   * @param missing the specified value to represent the missing value
+   */
+  @throws(classOf[XGBoostError])
+  def this(data: Array[Float], nrow: Int, ncol: Int, missing: Float) {
+    this(new JDMatrix(data, nrow, ncol, missing))
+  }
+
   /**
    * set label of dmatrix
    *
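A small usage sketch of the new Scala constructor; the values and the `-0.1f` missing-value sentinel are illustrative:

    import ml.dmlc.xgboost4j.scala.DMatrix

    // A 2x3 matrix in row-major order; -0.1f marks missing cells.
    val data = Array(1.0f, -0.1f, 3.0f,
                     4.0f, 5.0f, -0.1f)
    val dmat = new DMatrix(data, 2, 3, -0.1f) // nrow = 2, ncol = 3, missing = -0.1f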
DMatrixTest.java
@@ -125,4 +125,34 @@ public class DMatrixTest {
 
     TestCase.assertTrue(Arrays.equals(weights, dmat0.getWeight()));
   }
+
+  @Test
+  public void testCreateFromDenseMatrixWithMissingValue() throws XGBoostError {
+    //create DMatrix from 10*5 dense matrix
+    int nrow = 10;
+    int ncol = 5;
+    float[] data0 = new float[nrow * ncol];
+    //put random nums
+    Random random = new Random();
+    for (int i = 0; i < nrow * ncol; i++) {
+      if (i % 10 == 0) {
+        data0[i] = -0.1f;
+      } else {
+        data0[i] = random.nextFloat();
+      }
+    }
+
+    //create label
+    float[] label0 = new float[nrow];
+    for (int i = 0; i < nrow; i++) {
+      label0[i] = random.nextFloat();
+    }
+
+    DMatrix dmat0 = new DMatrix(data0, nrow, ncol, -0.1f);
+    dmat0.setLabel(label0);
+
+    //check
+    TestCase.assertTrue(dmat0.rowNum() == 10);
+    TestCase.assertTrue(dmat0.getLabel().length == 10);
+  }
 }
DMatrixSuite.scala
@@ -82,4 +82,28 @@ class DMatrixSuite extends FunSuite {
     dmat0.setWeight(weights)
     assert(weights === dmat0.getWeight)
   }
+
+  test("create DMatrix from DenseMatrix with missing value") {
+    val nrow = 10
+    val ncol = 5
+    val data0 = new Array[Float](nrow * ncol)
+    // put random nums
+    for (i <- data0.indices) {
+      if (i % 10 == 0) {
+        data0(i) = -0.1f
+      } else {
+        data0(i) = Random.nextFloat()
+      }
+    }
+    // create label
+    val label0 = new Array[Float](nrow)
+    for (i <- label0.indices) {
+      label0(i) = Random.nextFloat()
+    }
+    val dmat0 = new DMatrix(data0, nrow, ncol, -0.1f)
+    dmat0.setLabel(label0)
+    // check
+    assert(dmat0.rowNum === 10)
+    assert(dmat0.getLabel.length === 10)
+  }
 }
callback.py
@@ -38,7 +38,7 @@ def print_evaluation(period=1, show_stdv=True):
     """
     def callback(env):
         """internal function"""
-        if env.rank != 0 or len(env.evaluation_result_list) == 0:
+        if env.rank != 0 or len(env.evaluation_result_list) == 0 or period is False:
            return
        i = env.iteration
        if (i % period == 0 or i + 1 == env.begin_iteration):
gbtree.cc
@@ -703,7 +703,7 @@ class Dart : public GBTree {
         weight_drop[i] *= factor;
       }
       for (size_t i = 0; i < size_new_trees; ++i) {
-        weight_drop.push_back(lr * factor);
+        weight_drop.push_back(factor);
       }
     } else {
       // normalize_type 0
@@ -712,7 +712,7 @@ class Dart : public GBTree {
         weight_drop[i] *= factor;
       }
       for (size_t i = 0; i < size_new_trees; ++i) {
-        weight_drop.push_back(1.0 * lr / (num_drop + lr));
+        weight_drop.push_back(1.0 / (num_drop + lr));
      }
    }
  }
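A quick numeric illustration of the second fix, taken directly from the two changed lines (values are arbitrary; `lr` is the learning rate, `numDrop` the number of dropped trees):

    // Weight given to each newly added tree, before and after the fix.
    val lr = 0.1
    val numDrop = 3
    val before = 1.0 * lr / (numDrop + lr) // ≈ 0.032: an extra factor of lr crept in
    val after  = 1.0 / (numDrop + lr)      // ≈ 0.323: the documented 1 / (k + learning_rate)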