[R] replace uses of T and F with TRUE and FALSE (#5778)

* [R-package] replace uses of T and F with TRUE and FALSE
* enable linting
* Remove skip

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>

This commit is contained in:
parent cb7f7e542c
commit c35be9dc40
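For context on the change itself: `TRUE` and `FALSE` are reserved words in R, while `T` and `F` are ordinary variables that merely default to those values, so any code can silently rebind them. A minimal illustration (base R only) of the failure mode the linter enabled below guards against:

```r
F <- TRUE  # legal: F is just a variable, not a reserved word

isFALSE(F)      # FALSE -- the shorthand no longer means logical FALSE
isFALSE(FALSE)  # TRUE  -- the reserved word cannot be masked

# Hence with_stats = TRUE is safe where with_stats = T is fragile.
```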
@@ -100,7 +100,7 @@ print(paste("test-error=", err))
 
 # You can dump the tree you learned using xgb.dump into a text file
 dump_path = file.path(tempdir(), 'dump.raw.txt')
-xgb.dump(bst, dump_path, with_stats = T)
+xgb.dump(bst, dump_path, with_stats = TRUE)
 
 # Finally, you can check which features are the most important.
 print("Most important features (look at column Gain):")
@@ -9,7 +9,7 @@ require(e1071)
 # Load Arthritis dataset in memory.
 data(Arthritis)
 # Create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good).
-df <- data.table(Arthritis, keep.rownames = F)
+df <- data.table(Arthritis, keep.rownames = FALSE)
 
 # Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
 # For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
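A quick note on the argument being fixed in these hunks: when `data.table()` wraps a `data.frame`, `keep.rownames` controls whether the frame's row names are preserved as a column (named `rn`). A minimal sketch, assuming the `vcd` package (which ships `Arthritis`) is installed:

```r
library(data.table)
library(vcd)
data(Arthritis)

head(data.table(Arthritis, keep.rownames = FALSE), 2)  # row names dropped
head(data.table(Arthritis, keep.rownames = TRUE), 2)   # kept as column "rn"
```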
@@ -19,7 +19,7 @@ if (!require(vcd)) {
 data(Arthritis)
 
 # create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good).
-df <- data.table(Arthritis, keep.rownames = F)
+df <- data.table(Arthritis, keep.rownames = FALSE)
 
 # Let's have a look to the data.table
 cat("Print the dataset\n")
@@ -19,18 +19,18 @@ treeInteractions <- function(input_tree, input_max_depth){
 setorderv(parents_left, 'ID_merge')
 setorderv(parents_right, 'ID_merge')
 
-trees <- merge(trees, parents_left, by='ID_merge', all.x=T)
+trees <- merge(trees, parents_left, by='ID_merge', all.x=TRUE)
 trees[!is.na(i.id), c(paste0('parent_', i-1), paste0('parent_feat_', i-1)):=list(i.id, i.feature)]
 trees[, c('i.id','i.feature'):=NULL]
 
-trees <- merge(trees, parents_right, by='ID_merge', all.x=T)
+trees <- merge(trees, parents_right, by='ID_merge', all.x=TRUE)
 trees[!is.na(i.id), c(paste0('parent_', i-1), paste0('parent_feat_', i-1)):=list(i.id, i.feature)]
 trees[, c('i.id','i.feature'):=NULL]
 }
 
 # Extract nodes with interactions
 interaction_trees <- trees[!is.na(Split) & !is.na(parent_1),
-c('Feature',paste0('parent_feat_',1:(input_max_depth-1))), with=F]
+c('Feature',paste0('parent_feat_',1:(input_max_depth-1))), with=FALSE]
 interaction_trees_split <- split(interaction_trees, 1:nrow(interaction_trees))
 interaction_list <- lapply(interaction_trees_split, as.character)
 
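Since `with=F` appears in several of these hunks: in `data.table`, `with = FALSE` tells `[` to treat `j` as a vector of column names (or positions) rather than evaluating it as an expression inside the table. A minimal sketch with a hypothetical table `dt`:

```r
library(data.table)

dt <- data.table(Feature = c("a", "b"), parent_feat_1 = c("x", "y"))
cols <- c("Feature", "parent_feat_1")

dt[, cols, with = FALSE]  # selects the columns named in `cols`
dt[, cols]                # evaluates j as an expression, so this just
                          # returns the character vector `cols` itself
```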
@@ -96,7 +96,7 @@ x1 <- sort(unique(x[['V1']]))
 for (i in 1:length(x1)){
 testdata <- copy(x[, -c('V1')])
 testdata[['V1']] <- x1[i]
-testdata <- testdata[, paste0('V',1:10), with=F]
+testdata <- testdata[, paste0('V',1:10), with=FALSE]
 pred <- predict(bst3, as.matrix(testdata))
 
 # Should not print out anything due to monotonic constraints
@@ -13,7 +13,7 @@ exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_Y
 # retains the missing values
 # NOTE: this dataset is comes ready out of the box
 options(na.action = 'na.pass')
-x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = F])
+x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE])
 options(na.action = 'na.omit')
 
 # response
@@ -12,7 +12,7 @@ flag_32bit = .Machine$sizeof.pointer != 8
 
 set.seed(1982)
 data(Arthritis)
-df <- data.table(Arthritis, keep.rownames = F)
+df <- data.table(Arthritis, keep.rownames = FALSE)
 df[,AgeDiscret := as.factor(round(Age / 10,0))]
 df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
 df[,ID := NULL]
@@ -47,7 +47,7 @@ test_that("xgb.dump works", {
 if (!flag_32bit)
 expect_length(xgb.dump(bst.Tree), 200)
 dump_file = file.path(tempdir(), 'xgb.model.dump')
-expect_true(xgb.dump(bst.Tree, dump_file, with_stats = T))
+expect_true(xgb.dump(bst.Tree, dump_file, with_stats = TRUE))
 expect_true(file.exists(dump_file))
 expect_gt(file.size(dump_file), 8000)
 
@@ -160,7 +160,7 @@ test_that("SHAPs sum to predictions, with or without DART", {
 objective = "reg:squarederror",
 eval_metric = "rmse"),
 if (booster == "dart")
-list(rate_drop = .01, one_drop = T)),
+list(rate_drop = .01, one_drop = TRUE)),
 data = d,
 label = y,
 nrounds = nrounds)
@@ -168,8 +168,8 @@ test_that("SHAPs sum to predictions, with or without DART", {
 pr <- function(...)
 predict(fit, newdata = d, ...)
 pred <- pr()
-shap <- pr(predcontrib = T)
-shapi <- pr(predinteraction = T)
+shap <- pr(predcontrib = TRUE)
+shapi <- pr(predinteraction = TRUE)
 tol = 1e-5
 
 expect_equal(rowSums(shap), pred, tol = tol)
@@ -107,7 +107,7 @@ test_that("SHAP contribution values are not NAN", {
 
 shaps <- as.data.frame(predict(fit,
 newdata = as.matrix(subset(d, fold == 1)[, ivs]),
-predcontrib = T))
+predcontrib = TRUE))
 result <- cbind(shaps, sum = rowSums(shaps), pred = predict(fit,
 newdata = as.matrix(subset(d, fold == 1)[, ivs])))
 
@@ -1,8 +1,6 @@
 context("Code is of high quality and lint free")
 test_that("Code Lint", {
 skip_on_cran()
-skip_on_travis()
-skip_if_not_installed("lintr")
 my_linters <- list(
 absolute_paths_linter=lintr::absolute_paths_linter,
 assignment_linter=lintr::assignment_linter,
@@ -21,7 +19,8 @@ test_that("Code Lint", {
 spaces_inside_linter=lintr::spaces_inside_linter,
 spaces_left_parentheses_linter=lintr::spaces_left_parentheses_linter,
 trailing_blank_lines_linter=lintr::trailing_blank_lines_linter,
-trailing_whitespace_linter=lintr::trailing_whitespace_linter
+trailing_whitespace_linter=lintr::trailing_whitespace_linter,
+true_false=lintr::T_and_F_symbol_linter
 )
-# lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality
+lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality
 })
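The newly enabled `true_false` linter is what flags bare `T`/`F` symbols. A rough sketch of exercising it on its own, assuming the lintr 2.x-style API used in this test (where linters are passed as objects in a named list):

```r
library(lintr)

tmp <- tempfile(fileext = ".R")
writeLines("keep_all <- T  # should be TRUE", tmp)

# Expect one lint recommending TRUE over the symbol T
lint(tmp, linters = list(true_false = lintr::T_and_F_symbol_linter))
```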
@@ -63,7 +63,7 @@ The first step is to load `Arthritis` dataset in memory and wrap it with `data.t
 
 ```{r, results='hide'}
 data(Arthritis)
-df <- data.table(Arthritis, keep.rownames = F)
+df <- data.table(Arthritis, keep.rownames = FALSE)
 ```
 
 > `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.
@@ -363,7 +363,7 @@ xgb.plot.importance(importance_matrix = importance_matrix)
 You can dump the tree you learned using `xgb.dump` into a text file.
 
 ```{r dump, message=T, warning=F}
-xgb.dump(bst, with_stats = T)
+xgb.dump(bst, with_stats = TRUE)
 ```
 
 You can plot the trees from your model using ```xgb.plot.tree``
@@ -14,5 +14,5 @@ data$STATE = as.factor(data$STATE)
 data$CLASS = as.factor(data$CLASS)
 data$GENDER = as.factor(data$GENDER)
 
-data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=T);
+data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=TRUE);
 write.table(data.dummy, 'autoclaims.csv', sep=',', row.names=F, col.names=F, quote=F)
@@ -1,8 +1,8 @@
 require(xgboost)
 require(methods)
 
-train = read.csv('data/train.csv',header=TRUE,stringsAsFactors = F)
-test = read.csv('data/test.csv',header=TRUE,stringsAsFactors = F)
+train = read.csv('data/train.csv',header=TRUE,stringsAsFactors = FALSE)
+test = read.csv('data/test.csv',header=TRUE,stringsAsFactors = FALSE)
 train = train[,-1]
 test = test[,-1]
 
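For background on why `stringsAsFactors = FALSE` matters here: under R versions before 4.0, `read.csv()` defaulted to converting character columns into factors. A small illustration, using inline `text=` input purely for the example:

```r
csv <- "id,target\n1,Class_1\n2,Class_2"

df <- read.csv(text = csv, stringsAsFactors = FALSE)
class(df$target)   # "character" -- left as text, ready for gsub() etc.

df2 <- read.csv(text = csv, stringsAsFactors = TRUE)
class(df2$target)  # "factor"
```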
@@ -30,8 +30,8 @@ require(xgboost)
 require(methods)
 require(data.table)
 require(magrittr)
-train <- fread('data/train.csv', header = T, stringsAsFactors = F)
-test <- fread('data/test.csv', header=TRUE, stringsAsFactors = F)
+train <- fread('data/train.csv', header = T, stringsAsFactors = FALSE)
+test <- fread('data/test.csv', header=TRUE, stringsAsFactors = FALSE)
 ```
 > `magrittr` and `data.table` are here to make the code cleaner and much more rapid.
 
@@ -42,13 +42,13 @@ Let's explore the dataset.
 dim(train)
 
 # Training content
-train[1:6,1:5, with =F]
+train[1:6,1:5, with =FALSE]
 
 # Test dataset dimensions
 dim(test)
 
 # Test content
-test[1:6,1:5, with =F]
+test[1:6,1:5, with =FALSE]
 ```
 > We only display the 6 first rows and 5 first columns for convenience
 
@@ -70,7 +70,7 @@ According to its description, the **Otto** challenge is a multi class classifica
 
 ```{r searchLabel}
 # Check the content of the last column
-train[1:6, ncol(train), with = F]
+train[1:6, ncol(train), with = FALSE]
 # Save the name of the last column
 nameLastCol <- names(train)[ncol(train)]
 ```
@@ -86,7 +86,7 @@ For that purpose, we will:
 
 ```{r classToIntegers}
 # Convert from classes to numbers
-y <- train[, nameLastCol, with = F][[1]] %>% gsub('Class_','',.) %>% {as.integer(.) -1}
+y <- train[, nameLastCol, with = FALSE][[1]] %>% gsub('Class_','',.) %>% {as.integer(.) -1}
 
 # Display the first 5 levels
 y[1:5]
@@ -95,7 +95,7 @@ y[1:5]
 We remove label column from training dataset, otherwise **XGBoost** would use it to guess the labels!
 
 ```{r deleteCols, results='hide'}
-train[, nameLastCol:=NULL, with = F]
+train[, nameLastCol:=NULL, with = FALSE]
 ```
 
 `data.table` is an awesome implementation of data.frame, unfortunately it is not a format supported natively by **XGBoost**. We need to convert both datasets (training and test) in `numeric` Matrix format.
@@ -163,7 +163,7 @@ Each *split* is done on one feature only at one value.
 Let's see what the model looks like.
 
 ```{r modelDump}
-model <- xgb.dump(bst, with.stats = T)
+model <- xgb.dump(bst, with.stats = TRUE)
 model[1:10]
 ```
 > For convenience, we are displaying the first 10 lines of the model only.
@@ -52,7 +52,7 @@ The first step is to load `Arthritis` dataset in memory and wrap it with `data.t
 
 ```r
 data(Arthritis)
-df <- data.table(Arthritis, keep.rownames = F)
+df <- data.table(Arthritis, keep.rownames = FALSE)
 ```
 
 > `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.
@@ -489,7 +489,7 @@ You can dump the tree you learned using `xgb.dump` into a text file.
 
 
 ```r
-xgb.dump(bst, with_stats = T)
+xgb.dump(bst, with_stats = TRUE)
 ```
 
 ```