Merge branch 'master' into master

2016-03-13 22:46:38 -04:00
parent 6f92f1c117 e1b2ad2e5e
commit e3fa7753f5
11 changed files with 193 additions and 25 deletions
--- a/NEWS.md
+++ b/NEWS.md
@@ -12,6 +12,9 @@ This file records the changes in xgboost library in reverse chronological order.
  - Enable registry pattern to allow optionally plugin of objective, metric, tree constructor, data loader.
    - Future plugin modules can be put into xgboost/plugin and register back to the library.
  - Remove most of the raw pointers to smart ptrs, for RAII safety.
+* Add official option to approximate algorithm `tree_method` to parameter.
+  - Change default behavior to switch to prefer faster algorithm.
+  - User will get a message when approximate algorithm is chosen.
 * Change library name to libxgboost.so
 * Backward compatiblity
  - The binary buffer file is not backward compatible with previous version.
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -1,5 +1,5 @@
-R package for xgboost
-=====================
+XGBoost R Package for Scalable GBM
+==================================

 [![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
 [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/xgboost)](http://cran.rstudio.com/web/packages/xgboost/index.html)
--- a/doc/jvm/index.md
+++ b/doc/jvm/index.md
@@ -18,3 +18,8 @@ Contents
 --------
 * [Java Overview Tutorial](java_intro.md)
 * [Code Examples](https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example)
+* [Java API Docs](http://dmlc.ml/docs/javadocs/index.html)
+* [Scala API Docs]
+  * [XGBoost4J](http://dmlc.ml/docs/scaladocs/xgboost4j/index.html)
+  * [XGBoost4J-Spark](http://dmlc.ml/docs/scaladocs/xgboost4j-spark/index.html)
+  * [XGBoost4J-Flink](http://dmlc.ml/docs/scaladocs/xgboost4j-flink/index.html)
--- a/doc/parameter.md
+++ b/doc/parameter.md
@@ -53,6 +53,24 @@ Parameters for Tree Booster
  - L2 regularization term on weights
 * alpha [default=0]
  - L1 regularization term on weights
+* tree_method, string [default='auto']
+  - The tree constructtion algorithm used in XGBoost(see description in the [reference paper](http://arxiv.org/abs/1603.02754))
+  - Distributed and external memory version only support approximate algorithm.
+  - Choices: {'auto', 'exact', 'approx'}
+    - 'auto': Use heuristic to choose faster one.
+      - For small to medium dataset, exact greedy will be used.
+      - For very large-dataset, approximate algorithm will be choosed.
+      - Because old behavior is always use exact greedy in single machine,
+        user will get a message when approximate algorithm is choosed to notify this choice.
+    - 'exact': Exact greedy algorithm.
+    - 'approx': Approximate greedy algorithm using sketching and histogram.
+* sketch_eps, [default=0.03]
+  - This is only used for approximate greedy algorithm.
+  - This roughly translated into ```O(1 / sketch_eps)``` number of bins.
+    Compared to directly select number of bins, this comes with theoretical ganrantee with sketch accuracy.
+  - Usuaully user do not have to tune this.
+    but consider set to lower number for more accurate enumeration.
+  - range: (0, 1)

 Parameters for Linear Booster
 -----------------------------
--- a/jvm-packages/README.md
+++ b/jvm-packages/README.md
@@ -56,7 +56,12 @@ object DistTrainWithSpark {
        "usage: program  num_of_rounds training_path model_path")
      sys.exit(1)
    }
-    val sc = new SparkContext()
+    // if you do not want to use KryoSerializer in Spark, you can ignore the related configuration
+    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoost-spark-example")
+      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+    sparkConf.registerKryoClasses(Array(classOf[Booster]))
+    val sc = new SparkContext(sparkConf)
+    val sc = new SparkContext(sparkConf)
    val inputTrainPath = args(1)
    val outputModelPath = args(2)
    // number of iterations
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -48,6 +48,41 @@
                    </execution>
                </executions>
            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-javadoc-plugin</artifactId>
+                <version>2.10.3</version>
+                <configuration>
+                    <excludePackageNames>
+                        ml.dmlc.xgboost4j.java.example
+                    </excludePackageNames>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>maven-site-plugin</artifactId>
+                <version>3.0</version>
+                <configuration>
+                    <reportPlugins>
+                        <plugin>
+                            <artifactId>maven-project-info-reports-plugin</artifactId>
+                            <version>2.2</version>
+                        </plugin>
+                        <plugin>
+                            <groupId>net.alchim31.maven</groupId>
+                            <artifactId>scala-maven-plugin</artifactId>
+                            <version>3.2.1</version>
+                            <configuration>
+                                <jvmArgs>
+                                    <jvmArg>-Xms64m</jvmArg>
+                                    <jvmArg>-Xmx1024m</jvmArg>
+                                </jvmArgs>
+                            </configuration>
+                        </plugin>
+                        ...
+                    </reportPlugins>
+                </configuration>
+            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-checkstyle-plugin</artifactId>
@@ -135,6 +170,11 @@
        </plugins>
    </build>
    <dependencies>
+        <dependency>
+            <groupId>com.esotericsoftware.kryo</groupId>
+            <artifactId>kryo</artifactId>
+            <version>2.21</version>
+        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-compiler</artifactId>
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/DistTrainWithSpark.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/DistTrainWithSpark.scala
@@ -16,9 +16,9 @@

 package ml.dmlc.xgboost4j.scala.example.spark

-import ml.dmlc.xgboost4j.scala.DMatrix
+import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
 import ml.dmlc.xgboost4j.scala.spark.{DataUtils, XGBoost}
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.mllib.util.MLUtils

 object DistTrainWithSpark {
@@ -28,7 +28,10 @@ object DistTrainWithSpark {
        "usage: program num_of_rounds num_workers training_path test_path model_path")
      sys.exit(1)
    }
-    val sc = new SparkContext()
+    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoost-spark-example")
+      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+    sparkConf.registerKryoClasses(Array(classOf[Booster]))
+    val sc = new SparkContext(sparkConf)
    val inputTrainPath = args(2)
    val inputTestPath = args(3)
    val outputModelPath = args(4)
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala
@@ -29,8 +29,8 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.{SparkConf, SparkContext}
 import org.scalatest.{BeforeAndAfter, FunSuite}

-import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, XGBoostError}
-import ml.dmlc.xgboost4j.scala.{DMatrix, EvalTrait}
+import ml.dmlc.xgboost4j.java.{Booster => JBooster, DMatrix => JDMatrix, XGBoostError}
+import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait}

 class XGBoostSuite extends FunSuite with BeforeAndAfter {

@@ -171,4 +171,23 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
    }
    customSparkContext.stop()
  }
+
+  test("kryoSerializer test") {
+    sc.stop()
+    sc = null
+    val eval = new EvalError()
+    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
+      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+    sparkConf.registerKryoClasses(Array(classOf[Booster]))
+    val customSparkContext = new SparkContext(sparkConf)
+    val trainingRDD = buildTrainingRDD(Some(customSparkContext))
+    val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    import DataUtils._
+    val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
+    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
+      "objective" -> "binary:logistic").toMap
+    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
+    assert(eval.eval(xgBoostModel.predict(testSetDMatrix), testSetDMatrix) < 0.1)
+    customSparkContext.stop()
+  }
 }
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java
@@ -20,13 +20,17 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;

+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.KryoSerializable;
+import com.esotericsoftware.kryo.io.Input;
+import com.esotericsoftware.kryo.io.Output;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;

 /**
 * Booster for xgboost, this is a model API that support interactive build of a XGBoost Model
 */
-public class Booster implements  Serializable {
+public class Booster implements Serializable, KryoSerializable {
  private static final Log logger = LogFactory.getLog(Booster.class);
  // handle to the booster.
  private long handle = 0;
@@ -436,7 +440,8 @@ public class Booster implements  Serializable {
    try {
      out.writeObject(this.toByteArray());
    } catch (XGBoostError ex) {
-      throw new IOException(ex.toString());
+      ex.printStackTrace();
+      logger.error(ex.getMessage());
    }
  }

@@ -447,7 +452,8 @@ public class Booster implements  Serializable {
      byte[] bytes = (byte[])in.readObject();
      JNIErrorHandle.checkCall(XGBoostJNI.XGBoosterLoadModelFromBuffer(this.handle, bytes));
    } catch (XGBoostError ex) {
-      throw new IOException(ex.toString());
+      ex.printStackTrace();
+      logger.error(ex.getMessage());
    }
  }

@@ -463,4 +469,33 @@ public class Booster implements  Serializable {
      handle = 0;
    }
  }
+
+  @Override
+  public void write(Kryo kryo, Output output) {
+    try {
+      byte[] serObj = this.toByteArray();
+      int serObjSize = serObj.length;
+      System.out.println("==== serialized obj size " + serObjSize);
+      output.writeInt(serObjSize);
+      output.write(serObj);
+    } catch (XGBoostError ex) {
+      ex.printStackTrace();
+      logger.error(ex.getMessage());
+    }
+  }
+
+  @Override
+  public void read(Kryo kryo, Input input) {
+    try {
+      this.init(null);
+      int serObjSize = input.readInt();
+      System.out.println("==== the size of the object: " + serObjSize);
+      byte[] bytes = new byte[serObjSize];
+      input.readBytes(bytes);
+      JNIErrorHandle.checkCall(XGBoostJNI.XGBoosterLoadModelFromBuffer(this.handle, bytes));
+    } catch (XGBoostError ex) {
+      ex.printStackTrace();
+      logger.error(ex.getMessage());
+    }
+  }
 }
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala
+++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala
@@ -18,12 +18,15 @@ package ml.dmlc.xgboost4j.scala

 import java.io.IOException

+import com.esotericsoftware.kryo.io.{Output, Input}
+import com.esotericsoftware.kryo.{Kryo, KryoSerializable}
 import ml.dmlc.xgboost4j.java.{Booster => JBooster}
 import ml.dmlc.xgboost4j.java.XGBoostError
 import scala.collection.JavaConverters._
 import scala.collection.mutable

-class Booster private[xgboost4j](booster: JBooster) extends Serializable {
+class Booster private[xgboost4j](private var booster: JBooster)
+  extends Serializable  with KryoSerializable {

  /**
    * Set parameter to the Booster.
@@ -193,4 +196,12 @@ class Booster private[xgboost4j](booster: JBooster) extends Serializable {
    super.finalize()
    dispose
  }
+
+  override def write(kryo: Kryo, output: Output): Unit = {
+    kryo.writeObject(output, booster)
+  }
+
+  override def read(kryo: Kryo, input: Input): Unit = {
+    booster = kryo.readObject(input, classOf[JBooster])
+  }
 }
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -4,6 +4,7 @@
 * \brief Implementation of learning algorithm.
 * \author Tianqi Chen
 */
+#include <xgboost/logging.h>
 #include <xgboost/learner.h>
 #include <dmlc/io.h>
 #include <algorithm>
@@ -69,6 +70,8 @@ struct LearnerTrainParam
  bool seed_per_iteration;
  // data split mode, can be row, col, or none.
  int dsplit;
+  // tree construction method
+  int tree_method;
  // internal test flag
  std::string test_flag;
  // maximum buffered row value
@@ -87,6 +90,11 @@ struct LearnerTrainParam
        .add_enum("col", 1)
        .add_enum("row", 2)
        .describe("Data split mode for distributed trainig. ");
+    DMLC_DECLARE_FIELD(tree_method).set_default(0)
+        .add_enum("auto", 0)
+        .add_enum("approx", 1)
+        .add_enum("exact", 2)
+        .describe("Choice of tree construction method.");
    DMLC_DECLARE_FIELD(test_flag).set_default("")
        .describe("Internal test flag");
    DMLC_DECLARE_FIELD(prob_buffer_row).set_default(1.0f).set_range(0.0f, 1.0f)
@@ -349,21 +357,42 @@ class LearnerImpl : public Learner {
  // check if p_train is ready to used by training.
  // if not, initialize the column access.
  inline void LazyInitDMatrix(DMatrix *p_train) {
-    if (p_train->HaveColAccess()) return;
-    int ncol = static_cast<int>(p_train->info().num_col);
-    std::vector<bool> enabled(ncol, true);
-    // set max row per batch to limited value
-    // in distributed mode, use safe choice otherwise
-    size_t max_row_perbatch = tparam.max_row_perbatch;
-    if (tparam.test_flag == "block" || tparam.dsplit == 2) {
-      max_row_perbatch = std::min(
-        static_cast<size_t>(32UL << 10UL), max_row_perbatch);
+    if (!p_train->HaveColAccess()) {
+      int ncol = static_cast<int>(p_train->info().num_col);
+      std::vector<bool> enabled(ncol, true);
+      // set max row per batch to limited value
+      // in distributed mode, use safe choice otherwise
+      size_t max_row_perbatch = tparam.max_row_perbatch;
+      const size_t safe_max_row = static_cast<size_t>(32UL << 10UL);
+
+      if (tparam.tree_method == 0 &&
+          p_train->info().num_row >= (4UL << 20UL)) {
+        LOG(CONSOLE) << "Tree method is automatically selected to be \'approx\'"
+                     << " for faster speed."
+                     << " to use old behavior(exact greedy algorithm on single machine),"
+                     << " set tree_method to \'exact\'";
+        max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
+      }
+
+      if (tparam.tree_method == 1) {
+        LOG(CONSOLE) << "Tree method is selected to be \'approx\'";
+        max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
+      }
+
+      if (tparam.test_flag == "block" || tparam.dsplit == 2) {
+        max_row_perbatch = std::min(max_row_perbatch, safe_max_row);
+      }
+      // initialize column access
+      p_train->InitColAccess(enabled,
+                             tparam.prob_buffer_row,
+                             max_row_perbatch);
    }
-    // initialize column access
-    p_train->InitColAccess(enabled,
-                           tparam.prob_buffer_row,
-                           max_row_perbatch);
+
    if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
+      if (tparam.tree_method == 2) {
+        LOG(CONSOLE) << "tree method is set to be 'exact',"
+                     << " but currently we are only able to proceed with approximate algorithm";
+      }
      cfg_["updater"] = "grow_histmaker,prune";
      if (gbm_.get() != nullptr) {
        gbm_->Configure(cfg_.begin(), cfg_.end());