65 lines
1.5 KiB
R
65 lines
1.5 KiB
R
library(data.table)
|
|
library(xgboost)
|
|
|
|
# Fetch the UCI dermatology data set on the first run; later runs reuse the
# copy already present in the working directory.
data_file <- "dermatology.data"
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data"

if (!file.exists(data_file)) {
  # Use R's default download method rather than shelling out to an external
  # `curl` binary: the external tool may be missing, and it reports failure
  # only through its exit status.
  status <- tryCatch(
    download.file(data_url, destfile = data_file),
    error = function(e) {
      # Do not leave a partial file behind: it would satisfy file.exists() on
      # the next run and the broken copy would be read silently.
      unlink(data_file)
      stop(e)
    }
  )
  # download.file() returns 0 on success and non-zero on failure.
  if (status != 0L) {
    unlink(data_file)
    stop(
      "Failed to download ", data_url, " (status ", status, ")",
      call. = FALSE
    )
  }
}
|
|
|
|
# Read the raw comma-separated file; it has no header row, so fread names the
# columns V1, V2, ...
df <- fread("dermatology.data", sep = ",", header = FALSE)

# V34 marks unknown values with "?": recode those to 0 and store the column as
# integer.
df[, V34 := as.integer(ifelse(V34 == "?", 0L, V34))]

# V35 is used as the class label further down; shift it down by one so the
# labels start at 0 (xgboost's multi-class objectives expect labels in
# 0 .. num_class - 1 -- see the xgboost parameter docs).
df[, V35 := V35 - 1L]
|
|
|
|
# Random 70/30 split of the rows: `idx` holds the row numbers drawn (without
# replacement) for training, everything else becomes the test set.
n_rows <- nrow(df)
idx <- sample(n_rows, size = round(0.7 * n_rows), replace = FALSE)

train <- df[idx]
test <- df[-idx]

# Columns 1-34 are the features, V35 is the label.
train_x <- train[, .SD, .SDcols = 1:34]
train_y <- train[["V35"]]

test_x <- test[, .SD, .SDcols = 1:34]
test_y <- test[["V35"]]

# Wrap features and labels in xgboost's DMatrix container.
xg_train <- xgb.DMatrix(data = as.matrix(train_x), label = train_y)
xg_test <- xgb.DMatrix(data = as.matrix(test_x), label = test_y)
|
|
|
|
# Booster settings. `objective` is switched to "multi:softprob" for the second
# run later in the script; the remaining entries are reused unchanged.
params <- list(
  objective = "multi:softmax", num_class = 6,
  max_depth = 6, nthread = 4, eta = 0.1
)

# Data sets whose evaluation metric is reported after every boosting round.
watchlist <- list(train = xg_train, test = xg_test)

# Train for 5 boosting rounds on the training split.
bst <- xgb.train(params, xg_train, nrounds = 5, watchlist = watchlist)
|
|
|
|
# Predict on the held-out split and compare the predictions with the true
# labels; the error rate is the share of test rows that disagree.
pred <- predict(object = bst, newdata = xg_test)
misclassified <- pred != test_y
error_rate <- sum(misclassified) / length(test_y)
print(paste("Test error using softmax =", error_rate))
|
|
|
|
# Do the same thing again, but output per-class probabilities instead of hard
# class labels.
params$objective <- "multi:softprob"
bst <- xgb.train(
  params = params,
  data = xg_train,
  nrounds = 5,
  watchlist = watchlist
)

pred_prob <- predict(bst, xg_test)

# Bring the probabilities into an [n_obs, num_class] matrix.
# Older xgboost releases return one flat vector laid out observation by
# observation (num_class consecutive values per row), which is what the
# byrow = TRUE reshape expects. Newer releases are documented to return the
# matrix directly (confirm for the installed version); re-wrapping such a
# matrix with byrow = TRUE would scramble it, so it is used as is.
if (is.matrix(pred_prob)) {
  pred_mat <- pred_prob
} else {
  # Take the column count from the training parameters instead of repeating
  # the literal class count.
  pred_mat <- matrix(pred_prob, ncol = params$num_class, byrow = TRUE)
}

# Guard against silent recycling in the comparison below.
stopifnot(nrow(pred_mat) == length(test_y))

# Optional sanity check: every row should sum to (approximately) 1.
# rowSums(pred_mat)

# Most probable class per row; which.max is 1-based, the labels are 0-based.
pred_label <- apply(pred_mat, 1, which.max) - 1L
error_rate <- mean(pred_label != test_y)
print(paste("Test error using softprob =", error_rate))
|