Example JSON model parser and Schema. (#5137)

Jiaming Yuan 2019-12-23 19:47:35 +08:00 committed by GitHub
parent a4b929385e
commit 1d0ca49761
8 changed files with 655 additions and 15 deletions

.gitignore

@@ -17,7 +17,7 @@
*.tar.gz
*conf
*buffer
*model
*.model
*pyc
*.train
*.test


@@ -0,0 +1,3 @@
We introduced initial support for saving the XGBoost model in JSON format in 1.0.0. Note that
it is still experimental and under development; the output schema is subject to change due to
bug fixes or further refactoring. For an overview, see https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html .
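The workflow described above can be exercised with a short sketch (not part of this commit; it mirrors the tests added below, and the file name 'model.json' is illustrative):

# Minimal sketch: train a tiny model and save it in the experimental JSON format.
# The '.json' suffix selects JSON output in XGBoost 1.0.
import json

import numpy as np
import xgboost as xgb

X = np.random.random((10, 3))
y = np.random.randint(2, size=(10,))
bst = xgb.train({'tree_method': 'hist'}, xgb.DMatrix(X, y))
bst.save_model('model.json')

with open('model.json', 'r') as fd:
    model = json.load(fd)          # a plain Python dict following the schema below
print(list(model['learner'].keys()))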


@@ -0,0 +1,180 @@
'''Demonstration of parsing a JSON tree model file generated by XGBoost. The
support is experimental; the output schema is subject to change in the future.
'''
import json
import argparse
class Tree:
'''A tree built by XGBoost.'''
# Index into node array
_left = 0
_right = 1
_parent = 2
_ind = 3
_cond = 4
_default_left = 5
# Index into stat array
_loss_chg = 0
_sum_hess = 1
_base_weight = 2
_child_cnt = 3
def __init__(self, tree_id: int, nodes, stats):
self.tree_id = tree_id
self.nodes = nodes
self.stats = stats
def loss_change(self, node_id: int):
'''Loss gain of a node.'''
return self.stats[node_id][self._loss_chg]
def sum_hessian(self, node_id: int):
'''Sum Hessian of a node.'''
return self.stats[node_id][self._sum_hess]
def base_weight(self, node_id: int):
'''Base weight of a node.'''
return self.stats[node_id][self._base_weight]
def num_children(self, node_id: int):
'''Number of children of a node.'''
return self.stats[node_id][self._child_cnt]
def split_index(self, node_id: int):
'''Split feature index of node.'''
return self.nodes[node_id][self._ind]
def split_condition(self, node_id: int):
'''Split value of a node.'''
return self.nodes[node_id][self._cond]
def parent(self, node_id: int):
'''Parent ID of a node.'''
return self.nodes[node_id][self._parent]
def left_child(self, node_id: int):
'''Left child ID of a node.'''
return self.nodes[node_id][self._left]
def right_child(self, node_id: int):
'''Right child ID of a node.'''
return self.nodes[node_id][self._right]
def is_leaf(self, node_id: int):
        '''Whether a node is a leaf.'''
return self.nodes[node_id][self._left] == -1
def is_deleted(self, node_id: int):
'''Whether a node is deleted.'''
# std::numeric_limits<uint32_t>::max()
return self.nodes[node_id][self._ind] == 4294967295
def __str__(self):
stacks = [0]
nodes = []
while stacks:
node = {}
nid = stacks.pop()
node['node id'] = nid
node['gain'] = self.loss_change(nid)
node['cover'] = self.sum_hessian(nid)
nodes.append(node)
if not self.is_leaf(nid) and not self.is_deleted(nid):
left = self.left_child(nid)
right = self.right_child(nid)
stacks.append(left)
stacks.append(right)
string = '\n'.join(map(lambda x: ' ' + str(x), nodes))
return string
class Model:
'''Gradient boosted tree model.'''
def __init__(self, m: dict):
        '''Construct the Model from a JSON object.

        Parameters
        ----------
        m: A dictionary loaded by json.
        '''
        # Basic properties of the model
        self.learner_model_shape = m['learner']['learner_model_param']
        self.num_output_group = int(self.learner_model_shape['num_class'])
        self.num_feature = int(self.learner_model_shape['num_feature'])
        self.base_score = float(self.learner_model_shape['base_score'])
        # A field encoding which output group each tree belongs to
        self.tree_info = m['learner']['gradient_booster']['model'][
            'tree_info']
        model_shape = m['learner']['gradient_booster']['model'][
            'gbtree_model_param']
        # JSON representation of trees
        j_trees = m['learner']['gradient_booster']['model']['trees']
# Load the trees
self.num_trees = int(model_shape['num_trees'])
self.leaf_size = int(model_shape['size_leaf_vector'])
# Right now XGBoost doesn't support vector leaf yet
assert self.leaf_size == 0, str(self.leaf_size)
trees = []
for i in range(self.num_trees):
tree = j_trees[i]
tree_id = int(tree['id'])
assert tree_id == i, (tree_id, i)
# properties
left_children = tree['left_children']
right_children = tree['right_children']
parents = tree['parents']
split_conditions = tree['split_conditions']
split_indices = tree['split_indices']
default_left = tree['default_left']
# stats
base_weights = tree['base_weights']
loss_changes = tree['loss_changes']
sum_hessian = tree['sum_hessian']
leaf_child_counts = tree['leaf_child_counts']
stats = []
nodes = []
            # We mirror the structure used inside XGBoost, which is similar
            # to an adjacency list.
for node_id in range(len(left_children)):
nodes.append([
left_children[node_id], right_children[node_id],
parents[node_id], split_indices[node_id],
split_conditions[node_id], default_left[node_id]
])
stats.append([
loss_changes[node_id], sum_hessian[node_id],
base_weights[node_id], leaf_child_counts[node_id]
])
tree = Tree(tree_id, nodes, stats)
trees.append(tree)
self.trees = trees
def print_model(self):
for i, tree in enumerate(self.trees):
print('tree_id:', i)
print(tree)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Demonstration for loading and printing XGBoost model.')
parser.add_argument('--model',
type=str,
required=True,
help='Path to JSON model file.')
args = parser.parse_args()
with open(args.model, 'r') as fd:
model = json.load(fd)
model = Model(model)
model.print_model()
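A hypothetical follow-up to the demo above (not part of this commit): once a JSON model has been loaded, the Model and Tree classes can be used programmatically instead of printing the whole model, for example to inspect the root split of the first tree. The file name 'model.json' is an assumption; the snippet reuses the classes and the json import defined in the script above.

# Hypothetical usage of the demo's Model/Tree classes: inspect the first tree.
with open('model.json', 'r') as fd:
    parsed = Model(json.load(fd))

first = parsed.trees[0]
root = 0
if not first.is_leaf(root):
    print('root split feature  :', first.split_index(root))
    print('root split condition:', first.split_condition(root))
    print('children            :',
          first.left_child(root), first.right_child(root))
else:
    print('root is a leaf with weight', first.base_weight(root))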

doc/model.schema Normal file

@@ -0,0 +1,413 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"gbtree_model_param": {
"type": "object",
"properties": {
"num_trees": {
"type": "string"
},
"size_leaf_vector": {
"type": "string"
}
},
"required": [
"num_trees",
"size_leaf_vector"
]
},
"tree_param": {
"type": "object",
"properties": {
"num_nodes": {
"type": "string"
},
"size_leaf_vector": {
"type": "string"
},
"num_feature": {
"type": "string"
}
},
"required": [
"num_nodes",
"num_feature",
"size_leaf_vector"
]
},
"reg_loss_param": {
"type": "object",
"properties": {
"scale_pos_weight": {
"type": "string"
}
}
},
"softmax_multiclass_param": {
"type": "object",
"properties": {
"num_class": { "type": "string" }
}
},
"lambda_rank_param": {
"type": "object",
"properties": {
"num_pairsample": { "type": "string" },
"fix_list_weight": { "type": "string" }
}
}
},
"type": "object",
"properties": {
"version": {
"type": "array",
"const": [
1,
0,
0
],
"additionalItems": false
},
"learner": {
"type": "object",
"properties": {
"gradient_booster": {
"oneOf": [
{
"type": "object",
"properties": {
"name": {
"const": "gbtree"
},
"model": {
"type": "object",
"properties": {
"gbtree_model_param": {
"$ref": "#/definitions/gbtree_model_param"
},
"trees": {
"type": "array",
"items": {
"type": "object",
"properties": {
"tree_param": {
"type": "object",
"properties": {
"num_nodes": {
"type": "string"
},
"size_leaf_vector": {
"type": "string"
},
"num_feature": {
"type": "string"
}
},
"required": [
"num_nodes",
"num_feature",
"size_leaf_vector"
]
},
"id": {
"type": "integer"
},
"loss_changes": {
"type": "array",
"items": {
"type": "number"
}
},
"sum_hessian": {
"type": "array",
"items": {
"type": "number"
}
},
"base_weights": {
"type": "array",
"items": {
"type": "number"
}
},
"leaf_child_counts": {
"type": "array",
"items": {
"type": "integer"
}
},
"left_children": {
"type": "array",
"items": {
"type": "integer"
}
},
"right_children": {
"type": "array",
"items": {
"type": "integer"
}
},
"parents": {
"type": "array",
"items": {
"type": "integer"
}
},
"split_indices": {
"type": "array",
"items": {
"type": "integer"
}
},
"split_conditions": {
"type": "array",
"items": {
"type": "number"
}
},
"default_left": {
"type": "array",
"items": {
"type": "boolean"
}
}
},
"required": [
"tree_param",
"loss_changes",
"sum_hessian",
"base_weights",
"leaf_child_counts",
"left_children",
"right_children",
"parents",
"split_indices",
"split_conditions",
"default_left"
]
}
},
"tree_info": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"required": [
"gbtree_model_param",
"trees"
]
}
},
"required": [
"name",
"model"
]
},
{
"type": "object",
"properties": {
"name": { "const": "gblinear" },
"model": {
"type": "object",
"properties": {
"weights": {
"type": "array",
"items": {
"type": "number"
}
}
}
}
}
}
]
},
"objective": {
"oneOf": [
{
"type": "object",
"properties": {
"name": { "const": "reg:squarederror" },
"reg_loss_param": { "$ref": "#/definitions/reg_loss_param"}
},
"required": [
"name",
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "reg:squaredlogerror" },
"reg_loss_param": { "$ref": "#/definitions/reg_loss_param"}
},
"required": [
"name",
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "reg:logistic" },
"reg_loss_param": { "$ref": "#/definitions/reg_loss_param"}
},
"required": [
"name",
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "binary:logistic" },
"reg_loss_param": { "$ref": "#/definitions/reg_loss_param"}
},
"required": [
"name",
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "binary:logitraw" },
"reg_loss_param": { "$ref": "#/definitions/reg_loss_param"}
},
"required": [
"name",
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "count:poisson" },
"poisson_regression_param": {
"type": "object",
"properties": {
"max_delta_step": { "type": "string" }
}
}
},
"required": [
"name",
"poisson_regression_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "reg:tweedie" },
"tweedie_regression_param": {
"type": "object",
"properties": {
"tweedie_variance_power": { "type": "string" }
}
}
},
"required": [
"name",
"tweedie_regression_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "survival:cox" }
},
"required": [ "name" ]
},
{
"type": "object",
"properties": {
"name": { "const": "reg:gamma" }
},
"required": [ "name" ]
},
{
"type": "object",
"properties": {
"name": { "const": "multi:softprob" },
"softmax_multiclass_param": { "$ref": "#/definitions/softmax_multiclass_param"}
},
"required": [
"name",
"softmax_multiclass_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "multi:softmax" },
"softmax_multiclass_param": { "$ref": "#/definitions/softmax_multiclass_param"}
},
"required": [
"name",
"softmax_multiclass_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "rank:pairwise" },
"lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"}
},
"required": [
"name",
"lambda_rank_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "rank:ndcg" },
"lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"}
},
"required": [
"name",
"lambda_rank_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "rank:map" },
"lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"}
},
"required": [
"name",
"lambda_rank_param"
]
}
]
},
"learner_model_param": {
"type": "object",
"properties": {
"base_score": { "type": "string" },
"num_class": { "type": "string" },
"num_feature": { "type": "string" }
}
}
},
"required": [
"gradient_booster",
"objective"
]
}
},
"required": [
"version",
"learner"
]
}
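The schema above can be checked programmatically with the jsonschema package, as the test added in this commit does. A minimal sketch, assuming the schema lives at doc/model.schema and the model was saved to 'model.json':

# Sketch: validate a saved JSON model against doc/model.schema (paths are assumptions).
import json

import jsonschema

with open('doc/model.schema', 'r') as fd:
    schema = json.load(fd)
with open('model.json', 'r') as fd:
    instance = json.load(fd)

# Raises jsonschema.ValidationError if the model does not conform to the schema.
jsonschema.validate(instance=instance, schema=schema)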


@@ -191,5 +191,16 @@ Future Plans
Right now using the JSON format incurs longer serialisation time; we have been working on
optimizing the JSON implementation to close the gap between the binary format and the JSON format.
You can track the progress in `#5046 <https://github.com/dmlc/xgboost/pull/5046>`_.
Another important item for JSON format support is a stable and documented `schema
<https://json-schema.org/>`_, based on which one can easily reuse the saved model.
***********
JSON Schema
***********
Another important feature of the JSON format is a documented `Schema
<https://json-schema.org/>`_, based on which one can easily reuse the output model from
XGBoost. Here is the initial draft of the JSON schema for the output model (not the
serialisation, which will not be stable as noted above). It is subject to change due to
its beta status. For an example of parsing the XGBoost tree model, see ``/demo/json-model``.
.. include:: ../model.schema
:code: json
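As a brief, hedged sketch of reusing a saved JSON model (mirroring the Python tests further below; file names are illustrative), the file can be loaded back into a Booster and re-serialised:

# Sketch: round-trip a JSON model through a Booster (file names are assumptions).
import xgboost as xgb

bst = xgb.Booster(model_file='model.json')   # load the JSON model back
bst.save_model('model-roundtrip.json')       # re-serialise, still in JSON format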


@@ -22,7 +22,7 @@ ENV GOSU_VERSION 1.10
# Install Python packages
RUN \
pip install pyyaml cpplint pylint astroid sphinx numpy scipy pandas matplotlib sh recommonmark guzzle_sphinx_theme mock \
breathe matplotlib graphviz pytest scikit-learn wheel kubernetes urllib3 && \
breathe matplotlib graphviz pytest scikit-learn wheel kubernetes urllib3 jsonschema && \
pip install https://h2o-release.s3.amazonaws.com/datatable/stable/datatable-0.7.0/datatable-0.7.0-cp37-cp37m-linux_x86_64.whl && \
pip install "dask[complete]"


@@ -3,6 +3,8 @@ import xgboost as xgb
import unittest
import os
import json
import testing as tm
import pytest
dpath = 'demo/data/'
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
@@ -11,6 +13,20 @@ dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
rng = np.random.RandomState(1994)
def json_model(model_path):
X = np.random.random((10, 3))
y = np.random.randint(2, size=(10,))
dm1 = xgb.DMatrix(X, y)
bst = xgb.train({'tree_method': 'hist'}, dm1)
bst.save_model(model_path)
with open(model_path, 'r') as fd:
model = json.load(fd)
return model
class TestModels(unittest.TestCase):
def test_glm(self):
param = {'verbosity': 0, 'objective': 'binary:logistic',
@@ -42,8 +58,9 @@ class TestModels(unittest.TestCase):
# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
model_path = 'xgb.model.dart'
# save model
bst.save_model('xgb.model.dart')
bst.save_model(model_path)
# load model and data in
bst2 = xgb.Booster(params=param, model_file='xgb.model.dart')
dtest2 = xgb.DMatrix('dtest.buffer')
@@ -69,6 +86,7 @@ class TestModels(unittest.TestCase):
for ii in range(len(preds_list)):
for jj in range(ii + 1, len(preds_list)):
assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0
os.remove(model_path)
def test_eta_decay(self):
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -204,21 +222,27 @@
bst.predict(dm2) # success
def test_model_json_io(self):
X = np.random.random((10, 3))
y = np.random.randint(2, size=(10,))
dm1 = xgb.DMatrix(X, y)
bst = xgb.train({'tree_method': 'hist'}, dm1)
bst.save_model('./model.json')
with open('./model.json', 'r') as fd:
j_model = json.load(fd)
model_path = './model.json'
j_model = json_model(model_path)
assert isinstance(j_model['learner'], dict)
bst = xgb.Booster(model_file='./model.json')
bst.save_model(fname=model_path)
with open('./model.json', 'r') as fd:
j_model = json.load(fd)
assert isinstance(j_model['learner'], dict)
os.remove('model.json')
os.remove(model_path)
@pytest.mark.skipif(**tm.no_json_schema())
def test_json_schema(self):
import jsonschema
model_path = './model.json'
path = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
doc = os.path.join(path, 'doc', 'model.schema')
with open(doc, 'r') as fd:
schema = json.load(fd)
jsonschema.validate(instance=json_model(model_path), schema=schema)
os.remove(model_path)


@@ -55,3 +55,12 @@ def no_dask_cudf():
return {'condition': False, 'reason': reason}
except ImportError:
return {'condition': True, 'reason': reason}
def no_json_schema():
reason = 'jsonschema is not installed'
try:
import jsonschema # noqa
return {'condition': False, 'reason': reason}
except ImportError:
return {'condition': True, 'reason': reason}