From 5411e2a5001a3f5c9d0c35c66d74dc4ddb4b492d Mon Sep 17 00:00:00 2001 From: kalenhaha Date: Mon, 12 May 2014 22:21:07 +0800 Subject: [PATCH] Add LETOR MQ2008 for rank demo --- demo/rank/README | 14 +--------- demo/rank/{toy.conf => mq2008.conf} | 7 ++--- demo/rank/runexp.sh | 11 ++++++++ demo/rank/trans_data.py | 40 +++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 16 deletions(-) rename demo/rank/{toy.conf => mq2008.conf} (90%) create mode 100644 demo/rank/runexp.sh create mode 100644 demo/rank/trans_data.py diff --git a/demo/rank/README b/demo/rank/README index 43eb1e431..6b40516d1 100644 --- a/demo/rank/README +++ b/demo/rank/README @@ -1,13 +1 @@ -Demonstrating how to use XGBoost accomplish regression tasks on computer hardware dataset https://archive.ics.uci.edu/ml/datasets/Computer+Hardware - -Run: ./runexp.sh - -Format of input: LIBSVM format - -Format of ```featmap.txt: \n ```: - - Feature id must be from 0 to number of features, in sorted order. - - i means this feature is binary indicator feature - - q means this feature is a quantitative value, such as age, time, can be missing - - int means this feature is integer value (when int is hinted, the decision boundary will be integer) - -Explainations: https://github.com/tqchen/xgboost/wiki/Regression +The dataset for the ranking demo is from LETOR 4.0 MQ2008 fold1, http://research.microsoft.com/en-us/um/beijing/projects/letor/letor4download.aspx diff --git a/demo/rank/toy.conf b/demo/rank/mq2008.conf similarity index 90% rename from demo/rank/toy.conf rename to demo/rank/mq2008.conf index 3379826d6..1901b72f0 100644 --- a/demo/rank/toy.conf +++ b/demo/rank/mq2008.conf @@ -10,6 +10,7 @@ objective="rank:pairwise" #objective="lambdarank:map" #objective="lambdarank:ndcg" +num_feature=50 # Tree Booster Parameters # step size shrinkage bst:eta = 1.0 @@ -26,10 +27,10 @@ num_round = 2 # 0 means do not save any model except the final round model save_period = 0 # The path of training data -data = "toy.train" 
"""Convert a RankSVM/LETOR-format file into XGBoost ranking input.

Each input line has the shape ``LABEL qid:ID FEAT:VAL ... # comment``.
Two files are produced:

* the feature file, one ``LABEL FEAT:VAL ...`` line per document (the
  ``qid`` column and any trailing ``#`` comment are dropped);
* the group file, one line per query holding the number of consecutive
  documents that share that query id.

Example, as invoked by the rank demo's runexp.sh::

    python trans_data.py train.txt mq2008.train mq2008.train.group

The script runs unchanged on Python 2.7 and Python 3.
"""
import sys


def save_data(group_data, output_feature, output_group):
    """Write one query group to the feature and group outputs.

    Args:
        group_data: list of token lists, one per document, laid out as
            ``[label, qid, feat1, feat2, ...]``.
        output_feature: writable text stream receiving one
            ``label feat1 feat2 ...`` line per document.
        output_group: writable text stream receiving a single line with
            the number of documents in this group.

    An empty ``group_data`` writes nothing, so callers may flush
    unconditionally.
    """
    if not group_data:
        return

    output_group.write(str(len(group_data)) + "\n")
    for data in group_data:
        # data[1] is the qid token; XGBoost takes grouping from the
        # separate group file, so it is left out of the feature line.
        output_feature.write(data[0] + " " + " ".join(data[2:]) + "\n")


def transform(lines, output_feature, output_group):
    """Stream RankSVM-format ``lines`` into feature and group outputs.

    Consecutive lines carrying the same second token (the qid) form one
    group. Blank and comment-only lines are skipped.

    Args:
        lines: iterable of input lines (for example an open text file).
        output_feature: writable text stream for the feature lines.
        output_group: writable text stream for the group sizes.

    Raises:
        ValueError: if a non-blank line has a label but no qid token.
    """
    group_data = []
    group = None  # qid of the group being accumulated; None before the first line
    for lineno, line in enumerate(lines, 1):
        # Drop the trailing "# ..." comment, then split on any run of
        # whitespace so repeated spaces or tabs never yield empty tokens.
        tokens = line.split("#", 1)[0].split()
        if not tokens:
            continue
        if len(tokens) < 2:
            raise ValueError(
                "line %d: expected 'LABEL qid:ID FEAT:VAL ...', got %r"
                % (lineno, line)
            )
        if tokens[1] != group:
            # The qid changed: flush the finished group and start a new one.
            save_data(group_data, output_feature, output_group)
            group_data = []
            group = tokens[1]
        group_data.append(tokens)

    # Flush the last group, which no qid change follows.
    save_data(group_data, output_feature, output_group)


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Args:
        argv: full argument vector including the program name; defaults
            to ``sys.argv``.

    Returns:
        0 on success, 1 when the argument count is wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 4:
        sys.stderr.write(
            "Usage: python trans_data.py [Ranksvm Format Input] "
            "[Output Feature File] [Output Group File]\n"
        )
        return 1

    # Open the input first so a missing input file does not leave empty
    # output files behind; ``with`` closes every file even on error.
    with open(argv[1]) as source:
        with open(argv[2], "w") as output_feature:
            with open(argv[3], "w") as output_group:
                transform(source, output_feature, output_group)
    return 0


if __name__ == "__main__":
    sys.exit(main())