Merge branch 'master' into master
This commit is contained in:
commit e3fa7753f5
@@ -12,6 +12,9 @@ This file records the changes in xgboost library in reverse chronological order.
 - Enable the registry pattern to allow optional plugins of objective, metric, tree constructor, and data loader.
 - Future plugin modules can be put into xgboost/plugin and registered back to the library.
 - Replace most of the raw pointers with smart pointers, for RAII safety.
+* Add an official `tree_method` parameter to opt into the approximate algorithm.
+  - Change the default behavior to prefer the faster algorithm.
+  - The user will get a message when the approximate algorithm is chosen.
 * Change library name to libxgboost.so
 * Backward compatibility
   - The binary buffer file is not backward compatible with previous versions.
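For readers unfamiliar with the registry pattern referenced in the changelog entry above: the sketch below is illustrative Scala only, not xgboost's actual C++ plugin API. It shows the general shape such a registry takes, where a plugin registers a factory under a name and the library looks the factory up by name at configuration time. The `Objective` trait and `ObjectiveRegistry` names are hypothetical.

    // Illustrative only: NOT xgboost's actual plugin API, just the shape of
    // a registry pattern. A plugin registers a factory under a name; the
    // library looks the factory up by name at configuration time.
    trait Objective {
      def name: String
    }

    object ObjectiveRegistry {
      private val factories =
        scala.collection.mutable.Map.empty[String, () => Objective]

      def register(name: String)(factory: () => Objective): Unit =
        factories(name) = factory

      def create(name: String): Objective =
        factories.getOrElse(name, sys.error(s"unknown objective: $name"))()
    }

    // A plugin module would register itself once at load time, e.g.:
    //   ObjectiveRegistry.register("my:objective")(() => new MyObjective)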
@@ -1,5 +1,5 @@
-R package for xgboost
-=====================
+XGBoost R Package for Scalable GBM
+==================================

 [](http://cran.r-project.org/web/packages/xgboost)
 [](http://cran.rstudio.com/web/packages/xgboost/index.html)
@@ -18,3 +18,8 @@ Contents
 --------
 * [Java Overview Tutorial](java_intro.md)
 * [Code Examples](https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example)
+* [Java API Docs](http://dmlc.ml/docs/javadocs/index.html)
+* [Scala API Docs]
+  * [XGBoost4J](http://dmlc.ml/docs/scaladocs/xgboost4j/index.html)
+  * [XGBoost4J-Spark](http://dmlc.ml/docs/scaladocs/xgboost4j-spark/index.html)
+  * [XGBoost4J-Flink](http://dmlc.ml/docs/scaladocs/xgboost4j-flink/index.html)
@@ -53,6 +53,24 @@ Parameters for Tree Booster
   - L2 regularization term on weights
 * alpha [default=0]
   - L1 regularization term on weights
+* tree_method, string [default='auto']
+  - The tree construction algorithm used in XGBoost (see the description in the [reference paper](http://arxiv.org/abs/1603.02754)).
+  - The distributed and external-memory versions only support the approximate algorithm.
+  - Choices: {'auto', 'exact', 'approx'}
+    - 'auto': Use a heuristic to choose the faster one.
+      - For small to medium datasets, the exact greedy algorithm will be used.
+      - For very large datasets, the approximate algorithm will be chosen.
+      - Because the old behavior always uses exact greedy on a single machine, the user will get a message when the approximate algorithm is chosen, to notify them of this choice.
+    - 'exact': Exact greedy algorithm.
+    - 'approx': Approximate greedy algorithm using sketching and histograms.
+* sketch_eps [default=0.03]
+  - This is only used for the approximate greedy algorithm.
+  - This roughly translates into `O(1 / sketch_eps)` bins; for example, the default 0.03 gives roughly 33 bins. Compared to directly selecting the number of bins, this comes with a theoretical guarantee on sketch accuracy.
+  - Usually the user does not have to tune this, but consider setting it to a lower number for a more accurate enumeration.
+  - range: (0, 1)
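As a sketch of how these parameters are passed in practice (not official documentation): from the JVM packages they go into the same paramMap used by the XGBoostSuite test shown later in this commit. Here `trainingRDD` and `numWorkers` are assumed to be in scope, as in that test, and it is assumed that tree-booster parameters are forwarded unchanged to the native learner.

    // Sketch: choosing the tree construction algorithm from the JVM packages.
    // trainingRDD and numWorkers are assumed to be in scope, as in the
    // XGBoostSuite test shown later in this commit.
    val paramMap = List(
      "eta" -> "1", "max_depth" -> "2", "silent" -> "0",
      "tree_method" -> "approx",   // one of 'auto', 'exact', 'approx'
      "sketch_eps" -> "0.03",      // ~O(1 / 0.03), i.e. about 33 bins
      "objective" -> "binary:logistic").toMap
    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)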

Parameters for Linear Booster
-----------------------------
@@ -56,7 +56,12 @@ object DistTrainWithSpark {
       "usage: program num_of_rounds training_path model_path")
     sys.exit(1)
   }
-  val sc = new SparkContext()
+  // if you do not want to use KryoSerializer in Spark, you can ignore the related configuration
+  val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoost-spark-example")
+    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+  sparkConf.registerKryoClasses(Array(classOf[Booster]))
+  val sc = new SparkContext(sparkConf)
   val inputTrainPath = args(1)
   val outputModelPath = args(2)
   // number of iterations
@@ -48,6 +48,41 @@
       </execution>
     </executions>
   </plugin>
+  <plugin>
+    <groupId>org.apache.maven.plugins</groupId>
+    <artifactId>maven-javadoc-plugin</artifactId>
+    <version>2.10.3</version>
+    <configuration>
+      <excludePackageNames>
+        ml.dmlc.xgboost4j.java.example
+      </excludePackageNames>
+    </configuration>
+  </plugin>
+  <plugin>
+    <groupId>net.alchim31.maven</groupId>
+    <artifactId>maven-site-plugin</artifactId>
+    <version>3.0</version>
+    <configuration>
+      <reportPlugins>
+        <plugin>
+          <artifactId>maven-project-info-reports-plugin</artifactId>
+          <version>2.2</version>
+        </plugin>
+        <plugin>
+          <groupId>net.alchim31.maven</groupId>
+          <artifactId>scala-maven-plugin</artifactId>
+          <version>3.2.1</version>
+          <configuration>
+            <jvmArgs>
+              <jvmArg>-Xms64m</jvmArg>
+              <jvmArg>-Xmx1024m</jvmArg>
+            </jvmArgs>
+          </configuration>
+        </plugin>
+        ...
+      </reportPlugins>
+    </configuration>
+  </plugin>
   <plugin>
     <groupId>org.apache.maven.plugins</groupId>
     <artifactId>maven-checkstyle-plugin</artifactId>
@@ -135,6 +170,11 @@
     </plugins>
   </build>
   <dependencies>
+    <dependency>
+      <groupId>com.esotericsoftware.kryo</groupId>
+      <artifactId>kryo</artifactId>
+      <version>2.21</version>
+    </dependency>
     <dependency>
       <groupId>org.scala-lang</groupId>
       <artifactId>scala-compiler</artifactId>
@@ -16,9 +16,9 @@

 package ml.dmlc.xgboost4j.scala.example.spark

-import ml.dmlc.xgboost4j.scala.DMatrix
+import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
 import ml.dmlc.xgboost4j.scala.spark.{DataUtils, XGBoost}
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.mllib.util.MLUtils

 object DistTrainWithSpark {
@@ -28,7 +28,10 @@ object DistTrainWithSpark {
       "usage: program num_of_rounds num_workers training_path test_path model_path")
     sys.exit(1)
   }
-  val sc = new SparkContext()
+  val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoost-spark-example")
+    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+  sparkConf.registerKryoClasses(Array(classOf[Booster]))
+  val sc = new SparkContext(sparkConf)
   val inputTrainPath = args(2)
   val inputTestPath = args(3)
   val outputModelPath = args(4)
@@ -29,8 +29,8 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.{SparkConf, SparkContext}
 import org.scalatest.{BeforeAndAfter, FunSuite}

-import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, XGBoostError}
-import ml.dmlc.xgboost4j.scala.{DMatrix, EvalTrait}
+import ml.dmlc.xgboost4j.java.{Booster => JBooster, DMatrix => JDMatrix, XGBoostError}
+import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait}

 class XGBoostSuite extends FunSuite with BeforeAndAfter {
@@ -171,4 +171,23 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
     }
     customSparkContext.stop()
   }
+
+  test("kryoSerializer test") {
+    sc.stop()
+    sc = null
+    val eval = new EvalError()
+    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
+      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+    sparkConf.registerKryoClasses(Array(classOf[Booster]))
+    val customSparkContext = new SparkContext(sparkConf)
+    val trainingRDD = buildTrainingRDD(Some(customSparkContext))
+    val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    import DataUtils._
+    val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
+    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
+      "objective" -> "binary:logistic").toMap
+    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
+    assert(eval.eval(xgBoostModel.predict(testSetDMatrix), testSetDMatrix) < 0.1)
+    customSparkContext.stop()
+  }
 }
@@ -20,13 +20,17 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;

+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.KryoSerializable;
+import com.esotericsoftware.kryo.io.Input;
+import com.esotericsoftware.kryo.io.Output;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;

 /**
  * Booster for xgboost, this is a model API that supports interactive building of an XGBoost model
  */
-public class Booster implements Serializable {
+public class Booster implements Serializable, KryoSerializable {
   private static final Log logger = LogFactory.getLog(Booster.class);
   // handle to the booster.
   private long handle = 0;
@@ -436,7 +440,8 @@ public class Booster implements Serializable {
     try {
       out.writeObject(this.toByteArray());
     } catch (XGBoostError ex) {
-      throw new IOException(ex.toString());
+      ex.printStackTrace();
+      logger.error(ex.getMessage());
     }
   }
@@ -447,7 +452,8 @@ public class Booster implements Serializable {
       byte[] bytes = (byte[])in.readObject();
       JNIErrorHandle.checkCall(XGBoostJNI.XGBoosterLoadModelFromBuffer(this.handle, bytes));
     } catch (XGBoostError ex) {
-      throw new IOException(ex.toString());
+      ex.printStackTrace();
+      logger.error(ex.getMessage());
     }
   }
@@ -463,4 +469,33 @@ public class Booster implements Serializable {
       handle = 0;
     }
   }
+
+  @Override
+  public void write(Kryo kryo, Output output) {
+    try {
+      byte[] serObj = this.toByteArray();
+      int serObjSize = serObj.length;
+      System.out.println("==== serialized obj size " + serObjSize);
+      output.writeInt(serObjSize);
+      output.write(serObj);
+    } catch (XGBoostError ex) {
+      ex.printStackTrace();
+      logger.error(ex.getMessage());
+    }
+  }
+
+  @Override
+  public void read(Kryo kryo, Input input) {
+    try {
+      this.init(null);
+      int serObjSize = input.readInt();
+      System.out.println("==== the size of the object: " + serObjSize);
+      byte[] bytes = new byte[serObjSize];
+      input.readBytes(bytes);
+      JNIErrorHandle.checkCall(XGBoostJNI.XGBoosterLoadModelFromBuffer(this.handle, bytes));
+    } catch (XGBoostError ex) {
+      ex.printStackTrace();
+      logger.error(ex.getMessage());
+    }
+  }
 }
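The write/read hooks above are driven by Kryo itself. The helper below is a hypothetical sketch, not part of this commit: it round-trips a trained Booster through Kryo directly, assuming `kryo` is configured so that it can instantiate Booster on read (Spark's KryoSerializer takes care of this when the class is registered via registerKryoClasses, as in the examples elsewhere in this commit).

    // Hypothetical helper, not part of this commit: serialize a trained
    // Booster through Kryo and read it back.
    import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
    import com.esotericsoftware.kryo.Kryo
    import com.esotericsoftware.kryo.io.{Input, Output}
    import ml.dmlc.xgboost4j.java.Booster

    object BoosterKryoRoundTrip {
      def roundTrip(kryo: Kryo, booster: Booster): Booster = {
        val buffer = new ByteArrayOutputStream()
        val output = new Output(buffer)
        kryo.writeObject(output, booster)  // dispatches to Booster.write(kryo, output)
        output.close()
        val input = new Input(new ByteArrayInputStream(buffer.toByteArray))
        val restored = kryo.readObject(input, classOf[Booster])  // dispatches to Booster.read
        input.close()
        restored
      }
    }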
@@ -18,12 +18,15 @@ package ml.dmlc.xgboost4j.scala

 import java.io.IOException

+import com.esotericsoftware.kryo.io.{Output, Input}
+import com.esotericsoftware.kryo.{Kryo, KryoSerializable}
 import ml.dmlc.xgboost4j.java.{Booster => JBooster}
 import ml.dmlc.xgboost4j.java.XGBoostError
 import scala.collection.JavaConverters._
 import scala.collection.mutable

-class Booster private[xgboost4j](booster: JBooster) extends Serializable {
+class Booster private[xgboost4j](private var booster: JBooster)
+  extends Serializable with KryoSerializable {

   /**
    * Set parameter to the Booster.
@@ -193,4 +196,12 @@ class Booster private[xgboost4j](booster: JBooster) extends Serializable {
     super.finalize()
     dispose
   }
+
+  override def write(kryo: Kryo, output: Output): Unit = {
+    kryo.writeObject(output, booster)
+  }
+
+  override def read(kryo: Kryo, input: Input): Unit = {
+    booster = kryo.readObject(input, classOf[JBooster])
+  }
 }
@@ -4,6 +4,7 @@
  * \brief Implementation of learning algorithm.
  * \author Tianqi Chen
  */
+#include <xgboost/logging.h>
 #include <xgboost/learner.h>
 #include <dmlc/io.h>
 #include <algorithm>
@@ -69,6 +70,8 @@ struct LearnerTrainParam
   bool seed_per_iteration;
   // data split mode, can be row, col, or none.
   int dsplit;
+  // tree construction method
+  int tree_method;
   // internal test flag
   std::string test_flag;
   // maximum buffered row value
@@ -87,6 +90,11 @@ struct LearnerTrainParam
       .add_enum("col", 1)
       .add_enum("row", 2)
       .describe("Data split mode for distributed training.");
+  DMLC_DECLARE_FIELD(tree_method).set_default(0)
+      .add_enum("auto", 0)
+      .add_enum("approx", 1)
+      .add_enum("exact", 2)
+      .describe("Choice of tree construction method.");
   DMLC_DECLARE_FIELD(test_flag).set_default("")
       .describe("Internal test flag");
   DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f)
@@ -349,21 +357,42 @@ class LearnerImpl : public Learner {
   // check if p_train is ready to be used by training.
   // if not, initialize the column access.
   inline void LazyInitDMatrix(DMatrix *p_train) {
-    if (p_train->HaveColAccess()) return;
-    int ncol = static_cast<int>(p_train->info().num_col);
-    std::vector<bool> enabled(ncol, true);
-    // set max row per batch to limited value
-    // in distributed mode, use safe choice otherwise
-    size_t max_row_perbatch = tparam.max_row_perbatch;
-    if (tparam.test_flag == "block" || tparam.dsplit == 2) {
-      max_row_perbatch = std::min(
-          static_cast<size_t>(32UL << 10UL), max_row_perbatch);
-    }
-    // initialize column access
-    p_train->InitColAccess(enabled,
-                           tparam.prob_buffer_row,
-                           max_row_perbatch);
+    if (!p_train->HaveColAccess()) {
+      int ncol = static_cast<int>(p_train->info().num_col);
+      std::vector<bool> enabled(ncol, true);
+      // set max row per batch to limited value
+      // in distributed mode, use safe choice otherwise
+      size_t max_row_perbatch = tparam.max_row_perbatch;
+      const size_t safe_max_row = static_cast<size_t>(32UL << 10UL);
+
+      if (tparam.tree_method == 0 &&
+          p_train->info().num_row >= (4UL << 20UL)) {
+        LOG(CONSOLE) << "Tree method is automatically selected to be \'approx\'"
+                     << " for faster speed."
+                     << " To use old behavior (exact greedy algorithm on single machine),"
+                     << " set tree_method to \'exact\'.";
+        max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
+      }
+
+      if (tparam.tree_method == 1) {
+        LOG(CONSOLE) << "Tree method is selected to be \'approx\'";
+        max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
+      }
+
+      if (tparam.test_flag == "block" || tparam.dsplit == 2) {
+        max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
+      }
+      // initialize column access
+      p_train->InitColAccess(enabled,
+                             tparam.prob_buffer_row,
+                             max_row_perbatch);
+    }

     if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
+      if (tparam.tree_method == 2) {
+        LOG(CONSOLE) << "tree method is set to be 'exact',"
+                     << " but currently we are only able to proceed with approximate algorithm";
+      }
       cfg_["updater"] = "grow_histmaker,prune";
       if (gbm_.get() != nullptr) {
         gbm_->Configure(cfg_.begin(), cfg_.end());
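For scale, the constants in the hunk above work out as follows: 32UL << 10UL is 32,768 rows (the safe per-batch cap), and 4UL << 20UL is 4,194,304 rows, so under tree_method='auto' a single-machine dataset of roughly four million rows or more triggers the switch to the approximate algorithm and prints the console message shown above.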