diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index 2a4ad6614..4a4e1811a 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -156,7 +156,7 @@ head(sparse_matrix)
 Create the output `numeric` vector (not as a sparse `Matrix`):
 
 ```{r}
-output_vector = df[,Improved] == "Marked"
+output_vector <- df[,Improved] == "Marked"
 ```
 
 1. set `Y` vector to `0`;
diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd
index fecc25a92..64038e02d 100644
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@@ -423,7 +423,7 @@ file.remove("dtrain.buffer")
 Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereafter we will extract `label` data.
 
 ```{r getinfo, message=F, warning=F}
-label = getinfo(dtest, "label")
+label <- getinfo(dtest, "label")
 pred <- predict(bst, dtest)
 err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
 print(paste("test-error=", err))
diff --git a/demo/data/gen_autoclaims.R b/demo/data/gen_autoclaims.R
index 4723c1dd0..5d5e6289f 100644
--- a/demo/data/gen_autoclaims.R
+++ b/demo/data/gen_autoclaims.R
@@ -8,11 +8,11 @@ library(dummies)
 library(insuranceData)
 
 data(AutoClaims)
-data = AutoClaims
+data <- AutoClaims
 
-data$STATE = as.factor(data$STATE)
-data$CLASS = as.factor(data$CLASS)
-data$GENDER = as.factor(data$GENDER)
+data$STATE <- as.factor(data$STATE)
+data$CLASS <- as.factor(data$CLASS)
+data$GENDER <- as.factor(data$GENDER)
 
 data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=TRUE);
 write.table(data.dummy, 'autoclaims.csv', sep=',', row.names=F, col.names=F, quote=F)
diff --git a/demo/kaggle-higgs/higgs-train.R b/demo/kaggle-higgs/higgs-train.R
index a9c462ac7..54e36a73f 100644
--- a/demo/kaggle-higgs/higgs-train.R
+++ b/demo/kaggle-higgs/higgs-train.R
@@ -23,9 +23,9 @@ param <- list("objective" = "binary:logitraw",
               "eval_metric" = "ams@0.15",
               "nthread" = 16)
 watchlist <- list("train" = xgmat)
-nrounds = 120
+nrounds <- 120
 print ("loading data end, start to boost trees")
-bst = xgb.train(param, xgmat, nrounds, watchlist );
+bst <- xgb.train(param, xgmat, nrounds, watchlist );
 # save out model
 xgb.save(bst, "higgs.model")
 print ('finish training')
diff --git a/demo/kaggle-higgs/speedtest.R b/demo/kaggle-higgs/speedtest.R
index a3c30c962..12924aa54 100644
--- a/demo/kaggle-higgs/speedtest.R
+++ b/demo/kaggle-higgs/speedtest.R
@@ -6,7 +6,7 @@ require(methods)
 testsize <- 550000
 dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)
-dtrain$Label = as.numeric(dtrain$Label=='s')
+dtrain$Label <- as.numeric(dtrain$Label=='s')
 
 # gbm.time = system.time({
 #   gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
 #                    interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,
@@ -24,11 +24,11 @@ sumwpos <- sum(weight * (label==1.0))
 sumwneg <- sum(weight * (label==0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
 
-xgboost.time = list()
-threads = c(1,2,4,8,16)
+xgboost.time <- list()
+threads <- c(1,2,4,8,16)
 for (i in 1:length(threads)){
-  thread = threads[i]
-  xgboost.time[[i]] = system.time({
+  thread <- threads[i]
+  xgboost.time[[i]] <- system.time({
     xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
     param <- list("objective" = "binary:logitraw",
                   "scale_pos_weight" = sumwneg / sumwpos,
@@ -38,9 +38,9 @@
"eval_metric" = "ams@0.15", "nthread" = thread) watchlist <- list("train" = xgmat) - nrounds = 120 + nrounds <- 120 print ("loading data end, start to boost trees") - bst = xgb.train(param, xgmat, nrounds, watchlist ); + bst <- xgb.train(param, xgmat, nrounds, watchlist ); # save out model xgb.save(bst, "higgs.model") print ('finish training') diff --git a/demo/kaggle-otto/otto_train_pred.R b/demo/kaggle-otto/otto_train_pred.R index 02989db9b..c60a123eb 100644 --- a/demo/kaggle-otto/otto_train_pred.R +++ b/demo/kaggle-otto/otto_train_pred.R @@ -1,20 +1,20 @@ require(xgboost) require(methods) -train = read.csv('data/train.csv',header=TRUE,stringsAsFactors = FALSE) -test = read.csv('data/test.csv',header=TRUE,stringsAsFactors = FALSE) -train = train[,-1] -test = test[,-1] +train <- read.csv('data/train.csv',header=TRUE,stringsAsFactors = FALSE) +test <- read.csv('data/test.csv',header=TRUE,stringsAsFactors = FALSE) +train <- train[,-1] +test <- test[,-1] -y = train[,ncol(train)] -y = gsub('Class_','',y) -y = as.integer(y)-1 # xgboost take features in [0,numOfClass) +y <- train[,ncol(train)] +y <- gsub('Class_','',y) +y <- as.integer(y)-1 # xgboost take features in [0,numOfClass) -x = rbind(train[,-ncol(train)],test) -x = as.matrix(x) -x = matrix(as.numeric(x),nrow(x),ncol(x)) -trind = 1:length(y) -teind = (nrow(train)+1):nrow(x) +x <- rbind(train[,-ncol(train)],test) +x <- as.matrix(x) +x <- matrix(as.numeric(x),nrow(x),ncol(x)) +trind <- 1:length(y) +teind <- (nrow(train)+1):nrow(x) # Set necessary parameter param <- list("objective" = "multi:softprob", @@ -23,21 +23,21 @@ param <- list("objective" = "multi:softprob", "nthread" = 8) # Run Cross Validation -cv.nrounds = 50 -bst.cv = xgb.cv(param=param, data = x[trind,], label = y, +cv.nrounds <- 50 +bst.cv <- xgb.cv(param=param, data = x[trind,], label = y, nfold = 3, nrounds=cv.nrounds) # Train the model -nrounds = 50 -bst = xgboost(param=param, data = x[trind,], label = y, nrounds=nrounds) +nrounds <- 50 +bst <- xgboost(param=param, data = x[trind,], label = y, nrounds=nrounds) # Make prediction -pred = predict(bst,x[teind,]) -pred = matrix(pred,9,length(pred)/9) -pred = t(pred) +pred <- predict(bst,x[teind,]) +pred <- matrix(pred,9,length(pred)/9) +pred <- t(pred) # Output submission -pred = format(pred, digits=2,scientific=F) # shrink the size of submission -pred = data.frame(1:nrow(pred),pred) -names(pred) = c('id', paste0('Class_',1:9)) +pred <- format(pred, digits=2,scientific=F) # shrink the size of submission +pred <- data.frame(1:nrow(pred),pred) +names(pred) <- c('id', paste0('Class_',1:9)) write.csv(pred,file='submission.csv', quote=FALSE,row.names=FALSE) diff --git a/demo/kaggle-otto/understandingXGBoostModel.Rmd b/demo/kaggle-otto/understandingXGBoostModel.Rmd index 0f4f82bc0..ac8898f48 100644 --- a/demo/kaggle-otto/understandingXGBoostModel.Rmd +++ b/demo/kaggle-otto/understandingXGBoostModel.Rmd @@ -127,7 +127,7 @@ param <- list("objective" = "multi:softprob", cv.nrounds <- 5 cv.nfold <- 3 -bst.cv = xgb.cv(param=param, data = trainMatrix, label = y, +bst.cv <- xgb.cv(param=param, data = trainMatrix, label = y, nfold = cv.nfold, nrounds = cv.nrounds) ``` > As we can see the error rate is low on the test dataset (for a 5mn trained model). @@ -135,8 +135,8 @@ bst.cv = xgb.cv(param=param, data = trainMatrix, label = y, Finally, we are ready to train the real model!!! 
 ```{r modelTraining}
-nrounds = 50
-bst = xgboost(param=param, data = trainMatrix, label = y, nrounds=nrounds)
+nrounds <- 50
+bst <- xgboost(param=param, data = trainMatrix, label = y, nrounds=nrounds)
 ```
 
 Model understanding
diff --git a/demo/multiclass_classification/train.R b/demo/multiclass_classification/train.R
index 4a07f278d..a9ded8748 100644
--- a/demo/multiclass_classification/train.R
+++ b/demo/multiclass_classification/train.R
@@ -26,7 +26,7 @@ test_x <- test[, 1:34]
 test_y <- test[, V35]
 
 xg_train <- xgb.DMatrix(data = as.matrix(train_x), label = train_y)
-xg_test = xgb.DMatrix(as.matrix(test_x), label = test_y)
+xg_test <- xgb.DMatrix(as.matrix(test_x), label = test_y)
 
 params <- list(
   objective = 'multi:softmax',
@@ -36,7 +36,7 @@ params <- list(
   eta = 0.1
 )
 
-watchlist = list(train = xg_train, test = xg_test)
+watchlist <- list(train = xg_train, test = xg_test)
 
 bst <- xgb.train(
   params = params,
@@ -60,5 +60,5 @@ pred_mat <- matrix(pred_prob, ncol = 6, byrow = TRUE)
 # rowSums(pred_mat)
 pred_label <- apply(pred_mat, 1, which.max) - 1L
 
-error_rate = sum(pred_label != test_y) / length(test_y)
+error_rate <- sum(pred_label != test_y) / length(test_y)
 print(paste("Test error using softprob =", error_rate))
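
For reference, the convention every hunk above applies is the common R style rule: use `<-` for binding a value to a name, and keep `=` for named arguments inside a function call. A minimal sketch is below; it is illustrative only and not part of the patch, and the toy matrix and label values are made up for the example.

```r
library(xgboost)

# Idiomatic R assignment: `<-` binds a value to a name.
nrounds <- 120

# `=` is still the right operator for named arguments inside a call;
# here `data` and `label` are arguments to xgb.DMatrix().
toy_features <- matrix(rnorm(40), nrow = 10)   # 10 rows, 4 columns (toy data)
toy_labels   <- rbinom(10, 1, 0.5)             # toy 0/1 labels
dtrain <- xgb.DMatrix(data = toy_features, label = toy_labels)
```

Top-level `x = 1` also works in R, but `<-` is what most R style guides (and linters such as lintr) recommend, which is the motivation for the changes in these demos and vignettes.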