From c3d8f21df339564e5b43bb5f42c41d12c46d8eec Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Wed, 31 Dec 2014 00:52:53 +0100
Subject: [PATCH 1/5] change assignment operator

---
 R-package/demo/create_sparse_matrix.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R
index b697def97..a333f3ac0 100644
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@@ -60,6 +60,6 @@ bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 3,
 xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 
 # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
-importance = xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
+importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
 print(importance)
 # According to the matrix below, the most important feature in this dataset to predict if the treatment will work is having received a Placebo or not.

From 4cc3790b76df514dc03a91ce46578a42fe02863a Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Wed, 31 Dec 2014 10:36:10 +0100
Subject: [PATCH 2/5] Improve explanation, add new concepts.

---
 R-package/demo/create_sparse_matrix.R | 40 +++++++++++++++++++++------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R
index a333f3ac0..b89ab80fa 100644
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@@ -27,20 +27,28 @@ print(df)
 cat("Structure of the dataset\n")
 str(df)
 
-# We remove the Age column which has no interest for the purpose of this demo.
-df[,Age:= NULL]
+# Let's add some new categorical features to see if they help. Of course these features are highly correlated with the Age feature. Usually that's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features, even in the case of highly correlated features.
+
+# For the first feature, we create groups of age by rounding the real age. Note that we transform it to a factor (categorical data) so that the algorithm treats the groups as independent values.
+df[,AgeDiscret:= as.factor(round(Age/10,0))]
+
+# Here is an even stronger simplification of the real age, with an arbitrary split at 30 years old. I chose this value based on nothing. We will see later whether simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
+df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
+
+# We remove ID as there is nothing to learn from this feature (it would just add noise, as the dataset is small).
+df[,ID:=NULL]
 
 # List the different values for the column Treatment: Placebo, Treated.
 cat("Values of the categorical feature Treatment\n")
 print(levels(df[,Treatment]))
 
 # Next step, we will transform the categorical data to dummy variables.
-# This method is also called dummy encoding.
+# This method is also called one-hot encoding.
 # The purpose is to transform each value of each categorical feature in one binary feature.
 #
-# For example, the column Treatment will be replaced by two columns, Placebo, and Treated. Each of them will be binary, meaning that it will contain the value 1 in the new column Placebo and 0 in the new column Treated, for observations which had the value Placebo in column Treatment before the transformation.
+# Let's take the column Treatment: it will be replaced by two columns, Placebo and Treated. Each of them will be binary. For example, an observation which had the value Placebo in column Treatment before the transformation will have, after the transformation, the value 1 in the new column Placebo and the value 0 in the new column Treated.
 #
-# Formulae Improved~.-1 means transform all categorical features but column Improved to binary values.
+# The formula Improved~.-1 used below means: transform all categorical features but column Improved to binary values.
 # Column Improved is excluded because it will be our output column, the one we want to predict.
 sparse_matrix = sparse.model.matrix(Improved~.-1, data = df)
 
@@ -55,11 +63,27 @@ output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
 # Following is the same process as other demo
 cat("Learning...\n")
-bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 3,
-               eta = 1, nround = 2,objective = "binary:logistic")
+bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
+               eta = 1, nround = 10, objective = "binary:logistic")
 
 xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 
 # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
 importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
 print(importance)
-# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is having received a Placebo or not.
+# According to the matrix below, the most important feature in this dataset for predicting whether the treatment will work is Age. The second most important feature is having received a placebo or not. Sex is third. Then come our generated features (AgeDiscret). We can see that their contribution is very low.
+
+# Do these results make sense?
+# Let's run a Chi-squared test between each of these features and the outcome.
+
+print(chisq.test(df$Age, df$Y))
+# The Chi-squared statistic between Age and the illness disappearing is about 35.
+
+print(chisq.test(df$AgeDiscret, df$Y))
+# Our first simplification of Age gives a Chi-squared statistic of about 8.
+
+print(chisq.test(df$AgeCat, df$Y))
+# The completely arbitrary split I did between young and old at 30 years old gives a low Chi-squared statistic of about 2.
+
+# As you can see, destroying information by simplifying it won't improve your model. The Chi-squared tests just demonstrate that.
+# It's even worse when you add some arbitrary rules.
+# However, even though we have added some useless new features highly correlated with other features, the boosted tree algorithm was still able to choose the best one, which in this case is Age.
\ No newline at end of file

From 9998575c323a8d27eead69db14d1d0b9b76febe6 Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Wed, 31 Dec 2014 10:47:57 +0100
Subject: [PATCH 3/5] Small text improvement

---
 R-package/demo/create_sparse_matrix.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R
index b89ab80fa..0e8c5efb6 100644
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@@ -82,8 +82,8 @@ print(chisq.test(df$AgeDiscret, df$Y))
 # Our first simplification of Age gives a Chi-squared statistic of about 8.
 
 print(chisq.test(df$AgeCat, df$Y))
-# The completely arbitrary split I did between young and old at 30 years old gives a low Chi-squared statistic of about 2.
+# The completely arbitrary split I did between young and old at 30 years old gives a low Chi-squared statistic of about 2. It's a result we might expect: maybe in my mind being over 30 means being old (I am 32 and starting to feel old, which may explain it), but for the illness we are studying, the vulnerable age is not the same. Don't let your "gut" lower the quality of your model. In "data science", there is science :-)
 
-# As you can see, destroying information by simplifying it won't improve your model. The Chi-squared tests just demonstrate that.
-# It's even worse when you add some arbitrary rules.
-# However, even though we have added some useless new features highly correlated with other features, the boosted tree algorithm was still able to choose the best one, which in this case is Age.
\ No newline at end of file
+# As you can see, in general destroying information by simplifying it won't improve your model. The Chi-squared tests just demonstrate that. But in more complex cases, creating a new feature which makes the link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not complex enough to show that. Check the Kaggle forums for some challenging datasets.
+# However, it's almost always worse when you add some arbitrary rules.
+# Moreover, you can notice that even though we have added some useless new features highly correlated with other features, the boosted tree algorithm was still able to choose the best one, which in this case is Age. A linear model might not be as robust in this scenario.

From 4f0ae53974082734463bc7b81f3ece60f4f85468 Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Wed, 31 Dec 2014 10:49:05 +0100
Subject: [PATCH 4/5] text change

---
 R-package/demo/create_sparse_matrix.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R
index 0e8c5efb6..cf0fcac4d 100644
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@@ -84,6 +84,6 @@ print(chisq.test(df$AgeDiscret, df$Y))
 print(chisq.test(df$AgeCat, df$Y))
 # The completely arbitrary split I did between young and old at 30 years old gives a low Chi-squared statistic of about 2. It's a result we might expect: maybe in my mind being over 30 means being old (I am 32 and starting to feel old, which may explain it), but for the illness we are studying, the vulnerable age is not the same. Don't let your "gut" lower the quality of your model. In "data science", there is science :-)
 
-# As you can see, in general destroying information by simplifying it won't improve your model. The Chi-squared tests just demonstrate that. But in more complex cases, creating a new feature which makes the link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not complex enough to show that. Check the Kaggle forums for some challenging datasets.
+# As you can see, in general destroying information by simplifying it won't improve your model. The Chi-squared tests just demonstrate that. But in more complex cases, creating a new feature based on an existing one which makes the link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not complex enough to show that. Check the Kaggle forums for some challenging datasets.
 # However, it's almost always worse when you add some arbitrary rules.
 # Moreover, you can notice that even though we have added some useless new features highly correlated with other features, the boosted tree algorithm was still able to choose the best one, which in this case is Age. A linear model might not be as robust in this scenario.
\ No newline at end of file

From d07be2bb963fc66cf3a3816f0c3d0de8678c2374 Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Wed, 31 Dec 2014 11:03:51 +0100
Subject: [PATCH 5/5] The username parameter is deprecated in install_github (see the package documentation for more information).

---
 R-package/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/README.md b/R-package/README.md
index c0ca87195..ca65df9a3 100644
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -6,7 +6,7 @@ For up-to-date version(which is recommended), please install from github. Window
 ```r
 require(devtools)
-install_github('xgboost','tqchen',subdir='R-package')
+install_github('tqchen/xgboost',subdir='R-package')
 ```
 For stable version on CRAN, please run
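
The demo patched above relies on sparse.model.matrix() to one-hot encode the categorical columns before training. As a rough illustration of what that call produces, here is a minimal sketch that is not part of the patches themselves; the toy data frame and its values are made up for the example, and only the Matrix package is assumed.

```r
# Standalone sketch of the one-hot encoding step used in the demo.
# The tiny "toy" data frame below is hypothetical; it only mimics the
# Treatment/Improved columns discussed in the patch comments.
library(Matrix)

toy <- data.frame(
  Treatment = factor(c("Placebo", "Treated", "Placebo")),
  Improved  = factor(c("None", "Marked", "Some"))
)

# Improved ~ . - 1 keeps every predictor, drops the intercept, and expands
# each level of the Treatment factor into its own binary (0/1) column.
sparse_toy <- sparse.model.matrix(Improved ~ . - 1, data = toy)

print(colnames(sparse_toy))  # "TreatmentPlacebo" "TreatmentTreated"
print(sparse_toy)            # 0/1 indicators stored as a sparse dgCMatrix
```

In the demo itself the same call is applied to the full data table, and the resulting sparse matrix is what gets passed to xgboost() as the training data.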