From 3e930e4f2d807f63f30be1d743bf5d3e4906a0cf Mon Sep 17 00:00:00 2001 From: ras44 <9282281+ras44@users.noreply.github.com> Date: Wed, 15 May 2019 03:35:44 +0200 Subject: [PATCH] added JSON vignette (#4439) --- R-package/vignettes/xgboostfromJSON.Rmd | 189 ++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 R-package/vignettes/xgboostfromJSON.Rmd diff --git a/R-package/vignettes/xgboostfromJSON.Rmd b/R-package/vignettes/xgboostfromJSON.Rmd new file mode 100644 index 000000000..046902b0f --- /dev/null +++ b/R-package/vignettes/xgboostfromJSON.Rmd @@ -0,0 +1,189 @@ +--- +title: "XGBoost from JSON" +output: + rmarkdown::html_vignette: + number_sections: yes + toc: yes +author: Roland Stevenson +vignette: > + %\VignetteIndexEntry{XGBoost from JSON} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +XGBoost from JSON +================= + +## Introduction + +The purpose of this vignette is to show you how to correctly load and work with an **XGBoost** model that has been dumped to JSON. **XGBoost** internally converts all data to [32-bit floats](https://en.wikipedia.org/wiki/Single-precision_floating-point_format), and the values dumped to JSON are decimal representations of these values. When working with a model that has been parsed from a JSON file, care must be taken to correctly treat: + +- the input data, which should be converted to 32-bit floats +- any 32-bit floats that were stored in JSON as decimal representations, which should be converted back to 32-bit floats +- any calculations, which must be done with 32-bit mathematical operators + +## Setup + +For the purpose of this tutorial we will load the xgboost, jsonlite, and float packages. We'll also set `digits=22` in our options in case we want to inspect many digits of our results. 
+
+```{r}
+library(xgboost)   # library() stops with an error if a package is missing
+library(jsonlite)
+library(float)
+options(digits = 22)  # print many significant digits when inspecting results
+```
+
+We will create a toy binary logistic model based on the example first provided [here](https://github.com/dmlc/xgboost/issues/3960), so that we can easily understand the structure of the dumped JSON model object. This will allow us to understand where discrepancies can occur and how they should be handled.
+
+```{r}
+dates <- c(20180130, 20180130, 20180130,  # plain numeric values, not Date objects
+           20180130, 20180130, 20180130,
+           20180131, 20180131, 20180131,
+           20180131, 20180131, 20180131,
+           20180131, 20180131, 20180131,
+           20180134, 20180134, 20180134)
+
+labels <- c(1, 1, 1,
+            1, 1, 1,
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0)
+
+data <- data.frame(dates = dates, labels = labels)
+
+bst <- xgboost(
+  data = as.matrix(data$dates),
+  label = labels,
+  nthread = 2,
+  nrounds = 1,  # a single boosting round
+  objective = "binary:logistic",
+  missing = NA,
+  max_depth = 1  # one split, so the tree has two leaves
+)
+```
+
+## Comparing results
+We will now dump the model to JSON and attempt to illustrate a variety of issues that can arise, and how to properly deal with them.
+
+First let's dump the model to JSON:
+
+```{r}
+bst_json <- xgb.dump(bst, with_stats = FALSE, dump_format = "json")
+bst_from_json <- jsonlite::fromJSON(bst_json, simplifyDataFrame = FALSE)
+node <- bst_from_json[[1]]  # first element of the parsed dump
+cat(bst_json)  # show the raw JSON text
+```
+
+The tree JSON shown by the above code-chunk tells us that if the data is less than 20180132, the tree will output the value in the first leaf. Otherwise it will output the value in the second leaf. Let's try to reproduce this manually with the data we have and confirm that it matches the model predictions we've already calculated.
+ +```{r} +bst_preds_logodds <- predict(bst,as.matrix(data$dates), outputmargin = TRUE) + +# calculate the logodds values using the JSON representation +bst_from_json_logodds <- ifelse(data$dates When working with imported JSON, all data must be converted to 32-bit floats + +To explain this, let's repeat the comparison and round to two decimals: + +```{r} +round(bst_preds_logodds,2) == round(bst_from_json_logodds,2) +``` + +If we round to two decimals, we see that only the elements related to data values of `20180131` don't agree. If we convert the data to floats, they agree: + +```{r} +# now convert the dates to floats first +bst_from_json_logodds <- ifelse(fl(data$dates) All JSON parameters stored as floats must be converted to floats. + +Let's now say we do care about numbers past the first two decimals. + +```{r} +# test that values are equal +bst_preds_logodds == bst_from_json_logodds +``` + +None are exactly equal. What happened? Although we've converted the data to 32-bit floats, we also need to convert the JSON parameters to 32-bit floats. Let's do this: + +```{r} +# now convert the dates to floats first +bst_from_json_logodds <- ifelse(fl(data$dates) Always use 32-bit numbers and operators + +We were able to get the log-odds to agree, so now let's manually calculate the sigmoid of the log-odds. This should agree with the xgboost predictions. + + +```{r} +bst_preds <- predict(bst,as.matrix(data$dates)) + +# calculate the predictions casting doubles to floats +bst_from_json_preds <- ifelse(fl(data$dates)