diff --git a/.gitignore b/.gitignore index f2a13f361..43dea5f43 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,7 @@ *.tar.gz *conf *buffer -*model +*.model *pyc *.train *.test diff --git a/demo/json-model/README.md b/demo/json-model/README.md new file mode 100644 index 000000000..065d854f4 --- /dev/null +++ b/demo/json-model/README.md @@ -0,0 +1,3 @@ +We introduced initial support for saving XGBoost model in JSON format in 1.0.0. Note that +it is still experimental and under development; the output schema is subject to change due to +bug fixes or further refactoring. For an overview, see https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html . \ No newline at end of file diff --git a/demo/json-model/json_parser.py b/demo/json-model/json_parser.py new file mode 100644 index 000000000..eedcbf9c2 --- /dev/null +++ b/demo/json-model/json_parser.py @@ -0,0 +1,180 @@ +'''Demonstration for parsing JSON tree model file generated by XGBoost. The +support is experimental, output schema is subject to change in the future. 
+''' +import json +import argparse + + +class Tree: + '''A tree built by XGBoost.''' + # Index into node array + _left = 0 + _right = 1 + _parent = 2 + _ind = 3 + _cond = 4 + _default_left = 5 + # Index into stat array + _loss_chg = 0 + _sum_hess = 1 + _base_weight = 2 + _child_cnt = 3 + + def __init__(self, tree_id: int, nodes, stats): + self.tree_id = tree_id + self.nodes = nodes + self.stats = stats + + def loss_change(self, node_id: int): + '''Loss gain of a node.''' + return self.stats[node_id][self._loss_chg] + + def sum_hessian(self, node_id: int): + '''Sum Hessian of a node.''' + return self.stats[node_id][self._sum_hess] + + def base_weight(self, node_id: int): + '''Base weight of a node.''' + return self.stats[node_id][self._base_weight] + + def num_children(self, node_id: int): + '''Number of children of a node.''' + return self.stats[node_id][self._child_cnt] + + def split_index(self, node_id: int): + '''Split feature index of node.''' + return self.nodes[node_id][self._ind] + + def split_condition(self, node_id: int): + '''Split value of a node.''' + return self.nodes[node_id][self._cond] + + def parent(self, node_id: int): + '''Parent ID of a node.''' + return self.nodes[node_id][self._parent] + + def left_child(self, node_id: int): + '''Left child ID of a node.''' + return self.nodes[node_id][self._left] + + def right_child(self, node_id: int): + '''Right child ID of a node.''' + return self.nodes[node_id][self._right] + + def is_leaf(self, node_id: int): + '''Whether a node is leaf.''' + return self.nodes[node_id][self._left] == -1 + + def is_deleted(self, node_id: int): + '''Whether a node is deleted.''' + # std::numeric_limits::max() + return self.nodes[node_id][self._ind] == 4294967295 + + def __str__(self): + stacks = [0] + nodes = [] + while stacks: + node = {} + nid = stacks.pop() + + node['node id'] = nid + node['gain'] = self.loss_change(nid) + node['cover'] = self.sum_hessian(nid) + nodes.append(node) + + if not self.is_leaf(nid) and not 
self.is_deleted(nid): + left = self.left_child(nid) + right = self.right_child(nid) + stacks.append(left) + stacks.append(right) + + string = '\n'.join(map(lambda x: ' ' + str(x), nodes)) + return string + + +class Model: + '''Gradient boosted tree model.''' + def __init__(self, m: dict): + '''Construct the Model from JSON object. + + parameters + ---------- + m: A dictionary loaded by json + ''' + # Basic property of a model + self.learner_model_shape = model['learner']['learner_model_param'] + self.num_output_group = int(self.learner_model_shape['num_class']) + self.num_feature = int(self.learner_model_shape['num_feature']) + self.base_score = float(self.learner_model_shape['base_score']) + # A field encoding which output group a tree belongs + self.tree_info = model['learner']['gradient_booster']['model'][ + 'tree_info'] + + model_shape = model['learner']['gradient_booster']['model'][ + 'gbtree_model_param'] + + # JSON representation of trees + j_trees = model['learner']['gradient_booster']['model']['trees'] + + # Load the trees + self.num_trees = int(model_shape['num_trees']) + self.leaf_size = int(model_shape['size_leaf_vector']) + # Right now XGBoost doesn't support vector leaf yet + assert self.leaf_size == 0, str(self.leaf_size) + + trees = [] + for i in range(self.num_trees): + tree = j_trees[i] + tree_id = int(tree['id']) + assert tree_id == i, (tree_id, i) + # properties + left_children = tree['left_children'] + right_children = tree['right_children'] + parents = tree['parents'] + split_conditions = tree['split_conditions'] + split_indices = tree['split_indices'] + default_left = tree['default_left'] + # stats + base_weights = tree['base_weights'] + loss_changes = tree['loss_changes'] + sum_hessian = tree['sum_hessian'] + leaf_child_counts = tree['leaf_child_counts'] + + stats = [] + nodes = [] + # We resemble the structure used inside XGBoost, which is similar + # to adjacency list. 
+ for node_id in range(len(left_children)): + nodes.append([ + left_children[node_id], right_children[node_id], + parents[node_id], split_indices[node_id], + split_conditions[node_id], default_left[node_id] + ]) + stats.append([ + loss_changes[node_id], sum_hessian[node_id], + base_weights[node_id], leaf_child_counts[node_id] + ]) + + tree = Tree(tree_id, nodes, stats) + trees.append(tree) + + self.trees = trees + + def print_model(self): + for i, tree in enumerate(self.trees): + print('tree_id:', i) + print(tree) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Demonstration for loading and printing XGBoost model.') + parser.add_argument('--model', + type=str, + required=True, + help='Path to JSON model file.') + args = parser.parse_args() + with open(args.model, 'r') as fd: + model = json.load(fd) + model = Model(model) + model.print_model() diff --git a/doc/model.schema b/doc/model.schema new file mode 100644 index 000000000..4e42cdac2 --- /dev/null +++ b/doc/model.schema @@ -0,0 +1,413 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "definitions": { + "gbtree_model_param": { + "type": "object", + "properties": { + "num_trees": { + "type": "string" + }, + "size_leaf_vector": { + "type": "string" + } + }, + "required": [ + "num_trees", + "size_leaf_vector" + ] + }, + "tree_param": { + "type": "object", + "properties": { + "num_nodes": { + "type": "string" + }, + "size_leaf_vector": { + "type": "string" + }, + "num_feature": { + "type": "string" + } + }, + "required": [ + "num_nodes", + "num_feature", + "size_leaf_vector" + ] + }, + + "reg_loss_param": { + "type": "object", + "properties": { + "scale_pos_weight": { + "type": "string" + } + } + }, + "softmax_multiclass_param": { + "type": "object", + "properties": { + "num_class": { "type": "string" } + } + }, + "lambda_rank_param": { + "type": "object", + "properties": { + "num_pairsample": { "type": "string" }, + "fix_list_weight": { "type": "string" } + } + } + 
}, + "type": "object", + "properties": { + "version": { + "type": "array", + "const": [ + 1, + 0, + 0 + ], + "additionalItems": false + }, + "learner": { + "type": "object", + "properties": { + "gradient_booster": { + "oneOf": [ + { + "type": "object", + "properties": { + "name": { + "const": "gbtree" + }, + "model": { + "type": "object", + "properties": { + "gbtree_model_param": { + "$ref": "#/definitions/gbtree_model_param" + }, + "trees": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tree_param": { + "type": "object", + "properties": { + "num_nodes": { + "type": "string" + }, + "size_leaf_vector": { + "type": "string" + }, + "num_feature": { + "type": "string" + } + }, + "required": [ + "num_nodes", + "num_feature", + "size_leaf_vector" + ] + }, + "id": { + "type": "integer" + }, + "loss_changes": { + "type": "array", + "items": { + "type": "number" + } + }, + "sum_hessian": { + "type": "array", + "items": { + "type": "number" + } + }, + "base_weights": { + "type": "array", + "items": { + "type": "number" + } + }, + "leaf_child_counts": { + "type": "array", + "items": { + "type": "integer" + } + }, + "left_children": { + "type": "array", + "items": { + "type": "integer" + } + }, + "right_children": { + "type": "array", + "items": { + "type": "integer" + } + }, + "parents": { + "type": "array", + "items": { + "type": "integer" + } + }, + "split_indices": { + "type": "array", + "items": { + "type": "integer" + } + }, + "split_conditions": { + "type": "array", + "items": { + "type": "number" + } + }, + "default_left": { + "type": "array", + "items": { + "type": "boolean" + } + } + }, + "required": [ + "tree_param", + "loss_changes", + "sum_hessian", + "base_weights", + "leaf_child_counts", + "left_children", + "right_children", + "parents", + "split_indices", + "split_conditions", + "default_left" + ] + } + }, + "tree_info": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "required": [ + "gbtree_model_param", + 
"trees" + ] + } + }, + "required": [ + "name", + "model" + ] + }, + { + "type": "object", + "properties": { + "name": { "const": "gblinear" }, + "model": { + "type": "object", + "properties": { + "weights": { + "type": "array", + "items": { + "type": "number" + } + } + } + } + } + } + ] + }, + + "objective": { + "oneOf": [ + { + "type": "object", + "properties": { + "name": { "const": "reg:squarederror" }, + "reg_loss_param": { "$ref": "#/definitions/reg_loss_param"} + }, + "required": [ + "name", + "reg_loss_param" + ] + }, + { + "type": "object", + "properties": { + "name": { "const": "reg:squaredlogerror" }, + "reg_loss_param": { "$ref": "#/definitions/reg_loss_param"} + }, + "required": [ + "name", + "reg_loss_param" + ] + }, + { + "type": "object", + "properties": { + "name": { "const": "reg:logistic" }, + "reg_loss_param": { "$ref": "#/definitions/reg_loss_param"} + }, + "required": [ + "name", + "reg_loss_param" + ] + }, + { + "type": "object", + "properties": { + "name": { "const": "binary:logistic" }, + "reg_loss_param": { "$ref": "#/definitions/reg_loss_param"} + }, + "required": [ + "name", + "reg_loss_param" + ] + }, + { + "type": "object", + "properties": { + "name": { "const": "binary:logitraw" }, + "reg_loss_param": { "$ref": "#/definitions/reg_loss_param"} + }, + "required": [ + "name", + "reg_loss_param" + ] + }, + + { + "type": "object", + "properties": { + "name": { "const": "count:poisson" }, + "poisson_regression_param": { + "type": "object", + "properties": { + "max_delta_step": { "type": "string" } + } + } + }, + "required": [ + "name", + "poisson_regression_param" + ] + }, + { + "type": "object", + "properties": { + "name": { "const": "reg:tweedie" }, + "tweedie_regression_param": { + "type": "object", + "properties": { + "tweedie_variance_power": { "type": "string" } + } + } + }, + "required": [ + "name", + "tweedie_regression_param" + ] + }, + { + "type": "object", + "properties": { + "name": { "const": "survival:cox" } + }, + "required": 
[ "name" ] + }, + { + "type": "object", + "properties": { + "name": { "const": "reg:gamma" } + }, + "required": [ "name" ] + }, + + { + "type": "object", + "properties": { + "name": { "const": "multi:softprob" }, + "softmax_multiclass_param": { "$ref": "#/definitions/softmax_multiclass_param"} + }, + "required": [ + "name", + "softmax_multiclass_param" + ] + }, + { + "type": "object", + "properties": { + "name": { "const": "multi:softmax" }, + "softmax_multiclass_param": { "$ref": "#/definitions/softmax_multiclass_param"} + }, + "required": [ + "name", + "softmax_multiclass_param" + ] + }, + + { + "type": "object", + "properties": { + "name": { "const": "rank:pairwise" }, + "lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"} + }, + "required": [ + "name", + "lambda_rank_param" + ] + }, + { + "type": "object", + "properties": { + "name": { "const": "rank:ndcg" }, + "lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"} + }, + "required": [ + "name", + "lambda_rank_param" + ] + }, + { + "type": "object", + "properties": { + "name": { "const": "rank:map" }, + "lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"} + }, + "required": [ + "name", + "lambda_rank_param" + ] + } + ] + }, + + "learner_model_param": { + "type": "object", + "properties": { + "base_score": { "type": "string" }, + "num_class": { "type": "string" }, + "num_feature": { "type": "string" } + } + } + }, + "required": [ + "gradient_booster", + "objective" + ] + } + }, + "required": [ + "version", + "learner" + ] +} diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst index 99a6a42b6..62fc34434 100644 --- a/doc/tutorials/saving_model.rst +++ b/doc/tutorials/saving_model.rst @@ -191,5 +191,16 @@ Future Plans Right now using the JSON format incurs longer serialisation time, we have been working on optimizing the JSON implementation to close the gap between binary format and JSON format. You can track the progress in `#5046 `_. 
-Another important item for JSON format support is a stable and documented `schema -`_, based on which one can easily reuse the saved model. + +*********** +JSON Schema +*********** + +Another important feature of JSON format is a documented `Schema +`_, based on which one can easily reuse the output model from +XGBoost. Here is the initial draft of JSON schema for the output model (not +serialization, which will not be stable as noted above). It's subject to change due to +the beta status. For an example of parsing XGBoost tree model, see ``/demo/json-model``. + +.. include:: ../model.schema + :code: json diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu index 1890a64ea..56f740c63 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/tests/ci_build/Dockerfile.cpu @@ -22,7 +22,7 @@ ENV GOSU_VERSION 1.10 # Install Python packages RUN \ pip install pyyaml cpplint pylint astroid sphinx numpy scipy pandas matplotlib sh recommonmark guzzle_sphinx_theme mock \ - breathe matplotlib graphviz pytest scikit-learn wheel kubernetes urllib3 && \ + breathe matplotlib graphviz pytest scikit-learn wheel kubernetes urllib3 jsonschema && \ pip install https://h2o-release.s3.amazonaws.com/datatable/stable/datatable-0.7.0/datatable-0.7.0-cp37-cp37m-linux_x86_64.whl && \ pip install "dask[complete]" diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index eb71fc2fa..be5725a02 100644 --- a/tests/python/test_basic_models.py +++ b/tests/python/test_basic_models.py @@ -3,6 +3,8 @@ import xgboost as xgb import unittest import os import json +import testing as tm +import pytest dpath = 'demo/data/' dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') @@ -11,6 +13,20 @@ dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') rng = np.random.RandomState(1994) +def json_model(model_path): + X = np.random.random((10, 3)) + y = np.random.randint(2, size=(10,)) + + dm1 = xgb.DMatrix(X, y) + + bst = xgb.train({'tree_method': 'hist'}, dm1) + 
bst.save_model(model_path) + + with open(model_path, 'r') as fd: + model = json.load(fd) + return model + + class TestModels(unittest.TestCase): def test_glm(self): param = {'verbosity': 0, 'objective': 'binary:logistic', @@ -42,8 +58,9 @@ class TestModels(unittest.TestCase): # save dmatrix into binary buffer dtest.save_binary('dtest.buffer') + model_path = 'xgb.model.dart' # save model - bst.save_model('xgb.model.dart') + bst.save_model(model_path) # load model and data in bst2 = xgb.Booster(params=param, model_file='xgb.model.dart') dtest2 = xgb.DMatrix('dtest.buffer') @@ -69,6 +86,7 @@ class TestModels(unittest.TestCase): for ii in range(len(preds_list)): for jj in range(ii + 1, len(preds_list)): assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0 + os.remove(model_path) def test_eta_decay(self): watchlist = [(dtest, 'eval'), (dtrain, 'train')] @@ -204,21 +222,27 @@ class TestModels(unittest.TestCase): bst.predict(dm2) # success def test_model_json_io(self): - X = np.random.random((10, 3)) - y = np.random.randint(2, size=(10,)) - - dm1 = xgb.DMatrix(X, y) - bst = xgb.train({'tree_method': 'hist'}, dm1) - bst.save_model('./model.json') - - with open('./model.json', 'r') as fd: - j_model = json.load(fd) + model_path = './model.json' + j_model = json_model(model_path) assert isinstance(j_model['learner'], dict) bst = xgb.Booster(model_file='./model.json') + bst.save_model(fname=model_path) with open('./model.json', 'r') as fd: j_model = json.load(fd) assert isinstance(j_model['learner'], dict) - os.remove('model.json') + os.remove(model_path) + + @pytest.mark.skipif(**tm.no_json_schema()) + def test_json_schema(self): + import jsonschema + model_path = './model.json' + path = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + doc = os.path.join(path, 'doc', 'model.schema') + with open(doc, 'r') as fd: + schema = json.load(fd) + jsonschema.validate(instance=json_model(model_path), schema=schema) + os.remove(model_path) diff 
--git a/tests/python/testing.py b/tests/python/testing.py index 99747e04a..1ff88de1f 100644 --- a/tests/python/testing.py +++ b/tests/python/testing.py @@ -55,3 +55,12 @@ def no_dask_cudf(): return {'condition': False, 'reason': reason} except ImportError: return {'condition': True, 'reason': reason} + + +def no_json_schema(): + reason = 'jsonschema is not installed' + try: + import jsonschema # noqa + return {'condition': False, 'reason': reason} + except ImportError: + return {'condition': True, 'reason': reason}