Example JSON model parser and Schema. (#5137)

Jiaming Yuan 2019-12-23 19:47:35 +08:00 committed by GitHub
parent a4b929385e
commit 1d0ca49761
8 changed files with 655 additions and 15 deletions

.gitignore

@@ -17,7 +17,7 @@
*.tar.gz
*conf
*buffer
*model
*.model
*pyc
*.train
*.test


@@ -0,0 +1,3 @@
We introduced initial support for saving the XGBoost model in JSON format in 1.0.0. Note that
it is still experimental and under development; the output schema is subject to change due to
bug fixes or further refactoring. For an overview, see https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html .
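The workflow described above can be exercised with a short sketch (not part of this commit; it mirrors the tests added below, and the file name 'model.json' is illustrative):

# Minimal sketch: train a tiny model and save it in the experimental JSON format.
# The '.json' suffix selects JSON output in XGBoost 1.0.
import json

import numpy as np
import xgboost as xgb

X = np.random.random((10, 3))
y = np.random.randint(2, size=(10,))
bst = xgb.train({'tree_method': 'hist'}, xgb.DMatrix(X, y))
bst.save_model('model.json')

with open('model.json', 'r') as fd:
    model = json.load(fd)          # a plain Python dict following the schema below
print(list(model['learner'].keys()))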


@@ -0,0 +1,180 @@
'''Demonstration of parsing a JSON tree model file generated by XGBoost. The
support is experimental; the output schema is subject to change in the future.
'''
import json
import argparse
class Tree:
'''A tree built by XGBoost.'''
# Index into node array
_left = 0
_right = 1
_parent = 2
_ind = 3
_cond = 4
_default_left = 5
# Index into stat array
_loss_chg = 0
_sum_hess = 1
_base_weight = 2
_child_cnt = 3
def __init__(self, tree_id: int, nodes, stats):
self.tree_id = tree_id
self.nodes = nodes
self.stats = stats
def loss_change(self, node_id: int):
'''Loss gain of a node.'''
return self.stats[node_id][self._loss_chg]
def sum_hessian(self, node_id: int):
'''Sum Hessian of a node.'''
return self.stats[node_id][self._sum_hess]
def base_weight(self, node_id: int):
'''Base weight of a node.'''
return self.stats[node_id][self._base_weight]
def num_children(self, node_id: int):
'''Number of children of a node.'''
return self.stats[node_id][self._child_cnt]
def split_index(self, node_id: int):
'''Split feature index of node.'''
return self.nodes[node_id][self._ind]
def split_condition(self, node_id: int):
'''Split value of a node.'''
return self.nodes[node_id][self._cond]
def parent(self, node_id: int):
'''Parent ID of a node.'''
return self.nodes[node_id][self._parent]
def left_child(self, node_id: int):
'''Left child ID of a node.'''
return self.nodes[node_id][self._left]
def right_child(self, node_id: int):
'''Right child ID of a node.'''
return self.nodes[node_id][self._right]
def is_leaf(self, node_id: int):
        '''Whether a node is a leaf.'''
return self.nodes[node_id][self._left] == -1
def is_deleted(self, node_id: int):
'''Whether a node is deleted.'''
# std::numeric_limits<uint32_t>::max()
return self.nodes[node_id][self._ind] == 4294967295
def __str__(self):
stacks = [0]
nodes = []
while stacks:
node = {}
nid = stacks.pop()
node['node id'] = nid
node['gain'] = self.loss_change(nid)
node['cover'] = self.sum_hessian(nid)
nodes.append(node)
if not self.is_leaf(nid) and not self.is_deleted(nid):
left = self.left_child(nid)
right = self.right_child(nid)
stacks.append(left)
stacks.append(right)
string = '\n'.join(map(lambda x: ' ' + str(x), nodes))
return string
class Model:
'''Gradient boosted tree model.'''
def __init__(self, m: dict):
        '''Construct the Model from a JSON object.

        Parameters
        ----------
        m: A dictionary loaded by json.
        '''
        # Basic properties of the model
        self.learner_model_shape = m['learner']['learner_model_param']
        self.num_output_group = int(self.learner_model_shape['num_class'])
        self.num_feature = int(self.learner_model_shape['num_feature'])
        self.base_score = float(self.learner_model_shape['base_score'])
        # A field encoding which output group each tree belongs to
        self.tree_info = m['learner']['gradient_booster']['model'][
            'tree_info']
        model_shape = m['learner']['gradient_booster']['model'][
            'gbtree_model_param']
        # JSON representation of trees
        j_trees = m['learner']['gradient_booster']['model']['trees']
# Load the trees
self.num_trees = int(model_shape['num_trees'])
self.leaf_size = int(model_shape['size_leaf_vector'])
# Right now XGBoost doesn't support vector leaf yet
assert self.leaf_size == 0, str(self.leaf_size)
trees = []
for i in range(self.num_trees):
tree = j_trees[i]
tree_id = int(tree['id'])
assert tree_id == i, (tree_id, i)
# properties
left_children = tree['left_children']
right_children = tree['right_children']
parents = tree['parents']
split_conditions = tree['split_conditions']
split_indices = tree['split_indices']
default_left = tree['default_left']
# stats
base_weights = tree['base_weights']
loss_changes = tree['loss_changes']
sum_hessian = tree['sum_hessian']
leaf_child_counts = tree['leaf_child_counts']
stats = []
nodes = []
            # We mirror the structure used inside XGBoost, which is similar
            # to an adjacency list.
for node_id in range(len(left_children)):
nodes.append([
left_children[node_id], right_children[node_id],
parents[node_id], split_indices[node_id],
split_conditions[node_id], default_left[node_id]
])
stats.append([
loss_changes[node_id], sum_hessian[node_id],
base_weights[node_id], leaf_child_counts[node_id]
])
tree = Tree(tree_id, nodes, stats)
trees.append(tree)
self.trees = trees
def print_model(self):
for i, tree in enumerate(self.trees):
print('tree_id:', i)
print(tree)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Demonstration for loading and printing XGBoost model.')
parser.add_argument('--model',
type=str,
required=True,
help='Path to JSON model file.')
args = parser.parse_args()
with open(args.model, 'r') as fd:
model = json.load(fd)
model = Model(model)
model.print_model()
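A hypothetical follow-up to the demo above (not part of this commit): once a JSON model has been loaded, the Model and Tree classes can be used programmatically instead of printing the whole model, for example to inspect the root split of the first tree. The file name 'model.json' is an assumption; the snippet reuses the classes and the json import defined in the script above.

# Hypothetical usage of the demo's Model/Tree classes: inspect the first tree.
with open('model.json', 'r') as fd:
    parsed = Model(json.load(fd))

first = parsed.trees[0]
root = 0
if not first.is_leaf(root):
    print('root split feature  :', first.split_index(root))
    print('root split condition:', first.split_condition(root))
    print('children            :',
          first.left_child(root), first.right_child(root))
else:
    print('root is a leaf with weight', first.base_weight(root))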

doc/model.schema Normal file

@@ -0,0 +1,413 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"gbtree_model_param": {
"type": "object",
"properties": {
"num_trees": {
"type": "string"
},
"size_leaf_vector": {
"type": "string"
}
},
"required": [
"num_trees",
"size_leaf_vector"
]
},
"tree_param": {
"type": "object",
"properties": {
"num_nodes": {
"type": "string"
},
"size_leaf_vector": {
"type": "string"
},
"num_feature": {
"type": "string"
}
},
"required": [
"num_nodes",
"num_feature",
"size_leaf_vector"
]
},
"reg_loss_param": {
"type": "object",
"properties": {
"scale_pos_weight": {
"type": "string"
}
}
},
"softmax_multiclass_param": {
"type": "object",
"properties": {
"num_class": { "type": "string" }
}
},
"lambda_rank_param": {
"type": "object",
"properties": {
"num_pairsample": { "type": "string" },
"fix_list_weight": { "type": "string" }
}
}
},
"type": "object",
"properties": {
"version": {
"type": "array",
"const": [
1,
0,
0
],
"additionalItems": false
},
"learner": {
"type": "object",
"properties": {
"gradient_booster": {
"oneOf": [
{
"type": "object",
"properties": {
"name": {
"const": "gbtree"
},
"model": {
"type": "object",
"properties": {
"gbtree_model_param": {
"$ref": "#/definitions/gbtree_model_param"
},
"trees": {
"type": "array",
"items": {
"type": "object",
"properties": {
"tree_param": {
"type": "object",
"properties": {
"num_nodes": {
"type": "string"
},
"size_leaf_vector": {
"type": "string"
},
"num_feature": {
"type": "string"
}
},
"required": [
"num_nodes",
"num_feature",
"size_leaf_vector"
]
},
"id": {
"type": "integer"
},
"loss_changes": {
"type": "array",
"items": {
"type": "number"
}
},
"sum_hessian": {
"type": "array",
"items": {
"type": "number"
}
},
"base_weights": {
"type": "array",
"items": {
"type": "number"
}
},
"leaf_child_counts": {
"type": "array",
"items": {
"type": "integer"
}
},
"left_children": {
"type": "array",
"items": {
"type": "integer"
}
},
"right_children": {
"type": "array",
"items": {
"type": "integer"
}
},
"parents": {
"type": "array",
"items": {
"type": "integer"
}
},
"split_indices": {
"type": "array",
"items": {
"type": "integer"
}
},
"split_conditions": {
"type": "array",
"items": {
"type": "number"
}
},
"default_left": {
"type": "array",
"items": {
"type": "boolean"
}
}
},
"required": [
"tree_param",
"loss_changes",
"sum_hessian",
"base_weights",
"leaf_child_counts",
"left_children",
"right_children",
"parents",
"split_indices",
"split_conditions",
"default_left"
]
}
},
"tree_info": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"required": [
"gbtree_model_param",
"trees"
]
}
},
"required": [
"name",
"model"
]
},
{
"type": "object",
"properties": {
"name": { "const": "gblinear" },
"model": {
"type": "object",
"properties": {
"weights": {
"type": "array",
"items": {
"type": "number"
}
}
}
}
}
}
]
},
"objective": {
"oneOf": [
{
"type": "object",
"properties": {
"name": { "const": "reg:squarederror" },
"reg_loss_param": { "$ref": "#/definitions/reg_loss_param"}
},
"required": [
"name",
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "reg:squaredlogerror" },
"reg_loss_param": { "$ref": "#/definitions/reg_loss_param"}
},
"required": [
"name",
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "reg:logistic" },
"reg_loss_param": { "$ref": "#/definitions/reg_loss_param"}
},
"required": [
"name",
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "binary:logistic" },
"reg_loss_param": { "$ref": "#/definitions/reg_loss_param"}
},
"required": [
"name",
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "binary:logitraw" },
"reg_loss_param": { "$ref": "#/definitions/reg_loss_param"}
},
"required": [
"name",
"reg_loss_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "count:poisson" },
"poisson_regression_param": {
"type": "object",
"properties": {
"max_delta_step": { "type": "string" }
}
}
},
"required": [
"name",
"poisson_regression_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "reg:tweedie" },
"tweedie_regression_param": {
"type": "object",
"properties": {
"tweedie_variance_power": { "type": "string" }
}
}
},
"required": [
"name",
"tweedie_regression_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "survival:cox" }
},
"required": [ "name" ]
},
{
"type": "object",
"properties": {
"name": { "const": "reg:gamma" }
},
"required": [ "name" ]
},
{
"type": "object",
"properties": {
"name": { "const": "multi:softprob" },
"softmax_multiclass_param": { "$ref": "#/definitions/softmax_multiclass_param"}
},
"required": [
"name",
"softmax_multiclass_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "multi:softmax" },
"softmax_multiclass_param": { "$ref": "#/definitions/softmax_multiclass_param"}
},
"required": [
"name",
"softmax_multiclass_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "rank:pairwise" },
"lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"}
},
"required": [
"name",
"lambda_rank_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "rank:ndcg" },
"lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"}
},
"required": [
"name",
"lambda_rank_param"
]
},
{
"type": "object",
"properties": {
"name": { "const": "rank:map" },
"lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"}
},
"required": [
"name",
"lambda_rank_param"
]
}
]
},
"learner_model_param": {
"type": "object",
"properties": {
"base_score": { "type": "string" },
"num_class": { "type": "string" },
"num_feature": { "type": "string" }
}
}
},
"required": [
"gradient_booster",
"objective"
]
}
},
"required": [
"version",
"learner"
]
}
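The schema above can be checked programmatically with the jsonschema package, as the test added in this commit does. A minimal sketch, assuming the schema lives at doc/model.schema and the model was saved to 'model.json':

# Sketch: validate a saved JSON model against doc/model.schema (paths are assumptions).
import json

import jsonschema

with open('doc/model.schema', 'r') as fd:
    schema = json.load(fd)
with open('model.json', 'r') as fd:
    instance = json.load(fd)

# Raises jsonschema.ValidationError if the model does not conform to the schema.
jsonschema.validate(instance=instance, schema=schema)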


@@ -191,5 +191,16 @@ Future Plans
Right now using the JSON format incurs longer serialisation time; we have been working on
optimizing the JSON implementation to close the gap between the binary format and the JSON format.
You can track the progress in `#5046 <https://github.com/dmlc/xgboost/pull/5046>`_.
Another important item for JSON format support is a stable and documented `schema
<https://json-schema.org/>`_, based on which one can easily reuse the saved model.
***********
JSON Schema
***********
Another important feature of the JSON format is a documented `Schema
<https://json-schema.org/>`_, based on which one can easily reuse the output model from
XGBoost. Here is the initial draft of the JSON schema for the output model (not the
serialisation, which will not be stable as noted above). It is subject to change due to
its beta status. For an example of parsing the XGBoost tree model, see ``/demo/json-model``.
.. include:: ../model.schema
:code: json
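As a brief, hedged sketch of reusing a saved JSON model (mirroring the Python tests further below; file names are illustrative), the file can be loaded back into a Booster and re-serialised:

# Sketch: round-trip a JSON model through a Booster (file names are assumptions).
import xgboost as xgb

bst = xgb.Booster(model_file='model.json')   # load the JSON model back
bst.save_model('model-roundtrip.json')       # re-serialise, still in JSON format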


@@ -22,7 +22,7 @@ ENV GOSU_VERSION 1.10
# Install Python packages
RUN \
pip install pyyaml cpplint pylint astroid sphinx numpy scipy pandas matplotlib sh recommonmark guzzle_sphinx_theme mock \
breathe matplotlib graphviz pytest scikit-learn wheel kubernetes urllib3 && \
breathe matplotlib graphviz pytest scikit-learn wheel kubernetes urllib3 jsonschema && \
pip install https://h2o-release.s3.amazonaws.com/datatable/stable/datatable-0.7.0/datatable-0.7.0-cp37-cp37m-linux_x86_64.whl && \
pip install "dask[complete]"


@@ -3,6 +3,8 @@ import xgboost as xgb
import unittest
import os
import json
import testing as tm
import pytest
dpath = 'demo/data/'
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
@@ -11,6 +13,20 @@ dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
rng = np.random.RandomState(1994)
def json_model(model_path):
X = np.random.random((10, 3))
y = np.random.randint(2, size=(10,))
dm1 = xgb.DMatrix(X, y)
bst = xgb.train({'tree_method': 'hist'}, dm1)
bst.save_model(model_path)
with open(model_path, 'r') as fd:
model = json.load(fd)
return model
class TestModels(unittest.TestCase):
def test_glm(self):
param = {'verbosity': 0, 'objective': 'binary:logistic',
@@ -42,8 +58,9 @@ class TestModels(unittest.TestCase):
# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
model_path = 'xgb.model.dart'
# save model
bst.save_model('xgb.model.dart')
bst.save_model(model_path)
# load model and data in
bst2 = xgb.Booster(params=param, model_file='xgb.model.dart')
dtest2 = xgb.DMatrix('dtest.buffer')
@@ -69,6 +86,7 @@ class TestModels(unittest.TestCase):
for ii in range(len(preds_list)):
for jj in range(ii + 1, len(preds_list)):
assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0
os.remove(model_path)
def test_eta_decay(self):
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -204,21 +222,27 @@
bst.predict(dm2) # success
def test_model_json_io(self):
X = np.random.random((10, 3))
y = np.random.randint(2, size=(10,))
dm1 = xgb.DMatrix(X, y)
bst = xgb.train({'tree_method': 'hist'}, dm1)
bst.save_model('./model.json')
with open('./model.json', 'r') as fd:
j_model = json.load(fd)
model_path = './model.json'
j_model = json_model(model_path)
assert isinstance(j_model['learner'], dict)
bst = xgb.Booster(model_file='./model.json')
bst.save_model(fname=model_path)
with open('./model.json', 'r') as fd:
j_model = json.load(fd)
assert isinstance(j_model['learner'], dict)
os.remove('model.json')
os.remove(model_path)
@pytest.mark.skipif(**tm.no_json_schema())
def test_json_schema(self):
import jsonschema
model_path = './model.json'
path = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
doc = os.path.join(path, 'doc', 'model.schema')
with open(doc, 'r') as fd:
schema = json.load(fd)
jsonschema.validate(instance=json_model(model_path), schema=schema)
os.remove(model_path)


@@ -55,3 +55,12 @@ def no_dask_cudf():
return {'condition': False, 'reason': reason}
except ImportError:
return {'condition': True, 'reason': reason}
def no_json_schema():
reason = 'jsonschema is not installed'
try:
import jsonschema # noqa
return {'condition': False, 'reason': reason}
except ImportError:
return {'condition': True, 'reason': reason}