Add dump_format=json option (#1726)

* Add format to the params accepted by DumpModel

  Currently, only the text format is supported when trying to dump a model. The plan is to add more formats, such as JSON, which are easy for machines to read and/or parse, and to make the interface generic enough to allow other formats to be added later. Hence, we modify these functions to be generic and to accept a new parameter "format", which specifies the format of the dump to be created.

* Fix typos and errors in docs

* plugin: Mention all the register macros available

  Document the register macros currently available to plugin writers so they know exactly what can be extended using hooks.

* sparse_page_source: Use same arg name in .h and .cc

* gbm: Add JSON dump

  The dump_format argument can be used to specify what type of dump file should be created. Add functionality to dump gblinear and gbtree into a JSON file. The JSON file has an array; each item is a JSON object for the tree.

  For gblinear:
  - The item is the bias and weights vectors.

  For gbtree:
  - The item is the root node. The root node has an attribute "children" which holds the children nodes. This happens recursively.

* core.py: Add arg dump_format for get_dump()
2016-11-04 22:25:25 +05:30
parent 9c693f0f5f
commit b94fcab4dc
16 changed files with 320 additions and 92 deletions
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -224,16 +224,35 @@ class GBLinear : public GradientBooster {
    LOG(FATAL) << "gblinear does not support predict leaf index";
  }

-  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const override {
+  std::vector<std::string> DumpModel(const FeatureMap& fmap,
+                                     bool with_stats,
+                                     std::string format) const override {
    std::stringstream fo("");
-    fo << "bias:\n";
-    for (int i = 0; i < model.param.num_output_group; ++i) {
-      fo << model.bias()[i] << std::endl;
-    }
-    fo << "weight:\n";
-    for (int i = 0; i < model.param.num_output_group; ++i) {
-      for (unsigned j = 0; j <model.param.num_feature; ++j) {
-        fo << model[i][j] << std::endl;
+    if (format == "json") {
+      fo << "  { \"bias\": [" << std::endl;
+      for (int i = 0; i < model.param.num_output_group; ++i) {
+        if (i != 0) fo << "," << std::endl;
+        fo << "      " << model.bias()[i];
+      }
+      fo << std::endl << "    ]," << std::endl
+         << "    \"weight\": [" << std::endl;
+      for (int i = 0; i < model.param.num_output_group; ++i) {
+        for (unsigned j = 0; j < model.param.num_feature; ++j) {
+          if (i != 0 || j != 0) fo << "," << std::endl;
+          fo << "      " << model[i][j];
+        }
+      }
+      fo << std::endl << "    ]" << std::endl << "  }";
+    } else {
+      fo << "bias:\n";
+      for (int i = 0; i < model.param.num_output_group; ++i) {
+        fo << model.bias()[i] << std::endl;
+      }
+      fo << "weight:\n";
+      for (int i = 0; i < model.param.num_output_group; ++i) {
+        for (unsigned j = 0; j <model.param.num_feature; ++j) {
+          fo << model[i][j] << std::endl;
+        }
      }
    }
    std::vector<std::string> v;
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -64,7 +64,7 @@ struct DartTrainParam : public dmlc::Parameter<DartTrainParam> {
  // declare parameters
  DMLC_DECLARE_PARAMETER(DartTrainParam) {
    DMLC_DECLARE_FIELD(silent).set_default(false)
-        .describe("Not print information during trainig.");
+        .describe("Not print information during training.");
    DMLC_DECLARE_FIELD(sample_type).set_default(0)
        .add_enum("uniform", 0)
        .add_enum("weighted", 1)
@@ -275,10 +275,12 @@ class GBTree : public GradientBooster {
    this->PredPath(p_fmat, out_preds, ntree_limit);
  }

-  std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) const override {
+  std::vector<std::string> DumpModel(const FeatureMap& fmap,
+                                     bool with_stats,
+                                     std::string format) const override {
    std::vector<std::string> dump;
    for (size_t i = 0; i < trees.size(); i++) {
-      dump.push_back(trees[i]->Dump2Text(fmap, option & 1));
+      dump.push_back(trees[i]->DumpModel(fmap, with_stats, format));
    }
    return dump;
  }