Use black on more Python files. (#8137)

This commit is contained in:
Jiaming Yuan
2022-08-11 01:38:11 +08:00
committed by GitHub
parent bdb291f1c2
commit 570f8ae4ba
14 changed files with 183 additions and 133 deletions

View File

@@ -19,13 +19,14 @@ Also, see the tutorial for using XGBoost with categorical data:
"""
from __future__ import annotations
from time import time
import os
from tempfile import TemporaryDirectory
from time import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

View File

@@ -16,11 +16,13 @@ categorical data.
.. versionadded:: 1.5.0
"""
import pandas as pd
import numpy as np
import xgboost as xgb
from typing import Tuple
import numpy as np
import pandas as pd
import xgboost as xgb
def make_categorical(
n_samples: int, n_features: int, n_categories: int, onehot: bool

View File

@@ -1,35 +1,34 @@
'''
"""
Collection of examples for using xgboost.spark estimator interface
==================================================================
@author: Weichen Xu
'''
"""
import sklearn.datasets
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand
from pyspark.ml.linalg import Vectors
import sklearn.datasets
from sklearn.model_selection import train_test_split
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Build a Spark DataFrame with the columns "features" and "label" from X and y.
# Each (features, label) pair produced by zip(X, y) becomes one row: the
# features are wrapped with Vectors.dense and the label is converted with
# float(). The rows are handed to spark.sparkContext.parallelize before being
# passed to spark.createDataFrame.
# NOTE(review): this is a diff rendering with the +/- markers and indentation
# stripped -- the two parallelize(...) / ["features", "label"] groups below
# look like the before/after formatting of the same arguments, not four
# separate arguments. Confirm against the actual file.
def create_spark_df(X, y):
return spark.createDataFrame(
spark.sparkContext.parallelize([
(Vectors.dense(features), float(label))
for features, label in zip(X, y)
]),
["features", "label"]
spark.sparkContext.parallelize(
[(Vectors.dense(features), float(label)) for features, label in zip(X, y)]
),
["features", "label"],
)
# load diabetes dataset (regression dataset)
diabetes_X, diabetes_y = sklearn.datasets.load_diabetes(return_X_y=True)
# split into train/test parts with test_size=0.3 and shuffle=True
# NOTE(review): the split statement appears twice (backslash-continued form and
# parenthesised form); presumably the old and new formatting of one statement.
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = \
train_test_split(diabetes_X, diabetes_y, test_size=0.3, shuffle=True)
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
diabetes_X, diabetes_y, test_size=0.3, shuffle=True
)
# convert both splits into Spark DataFrames with create_spark_df
diabetes_train_spark_df = create_spark_df(diabetes_X_train, diabetes_y_train)
diabetes_test_spark_df = create_spark_df(diabetes_X_test, diabetes_y_test)
@@ -38,25 +37,36 @@ diabetes_test_spark_df = create_spark_df(diabetes_X_test, diabetes_y_test)
# train xgboost regressor model (max_depth=5) on the diabetes training DataFrame
xgb_regressor = SparkXGBRegressor(max_depth=5)
xgb_regressor_model = xgb_regressor.fit(diabetes_train_spark_df)
# apply the fitted model to the diabetes test DataFrame
transformed_diabetes_test_spark_df = xgb_regressor_model.transform(diabetes_test_spark_df)
transformed_diabetes_test_spark_df = xgb_regressor_model.transform(
diabetes_test_spark_df
)
# evaluate the transformed test DataFrame with the "rmse" metric and print the result
regressor_evaluator = RegressionEvaluator(metricName="rmse")
print(f"regressor rmse={regressor_evaluator.evaluate(transformed_diabetes_test_spark_df)}")
print(
f"regressor rmse={regressor_evaluator.evaluate(transformed_diabetes_test_spark_df)}"
)
# add a "validationIndicatorCol" column holding the result of rand(1) > 0.7;
# its name is passed as validation_indicator_col to the second regressor below
diabetes_train_spark_df2 = diabetes_train_spark_df.withColumn(
"validationIndicatorCol", rand(1) > 0.7
)
# train xgboost regressor model with validation dataset
xgb_regressor2 = SparkXGBRegressor(max_depth=5, validation_indicator_col="validationIndicatorCol")
xgb_regressor2 = SparkXGBRegressor(
max_depth=5, validation_indicator_col="validationIndicatorCol"
)
xgb_regressor_model2 = xgb_regressor2.fit(diabetes_train_spark_df2)
# score the same test DataFrame with the second model, reusing regressor_evaluator
transformed_diabetes_test_spark_df2 = xgb_regressor_model2.transform(diabetes_test_spark_df)
print(f"regressor2 rmse={regressor_evaluator.evaluate(transformed_diabetes_test_spark_df2)}")
transformed_diabetes_test_spark_df2 = xgb_regressor_model2.transform(
diabetes_test_spark_df
)
print(
f"regressor2 rmse={regressor_evaluator.evaluate(transformed_diabetes_test_spark_df2)}"
)
# load iris dataset (classification dataset)
iris_X, iris_y = sklearn.datasets.load_iris(return_X_y=True)
# split into train/test parts with test_size=0.3 and shuffle=True
# NOTE(review): the split statement appears twice (backslash-continued form and
# parenthesised form); presumably the old and new formatting of one statement.
iris_X_train, iris_X_test, iris_y_train, iris_y_test = \
train_test_split(iris_X, iris_y, test_size=0.3, shuffle=True)
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
iris_X, iris_y, test_size=0.3, shuffle=True
)
# convert both splits into Spark DataFrames with create_spark_df
iris_train_spark_df = create_spark_df(iris_X_train, iris_y_train)
iris_test_spark_df = create_spark_df(iris_X_test, iris_y_test)
@@ -74,9 +84,13 @@ iris_train_spark_df2 = iris_train_spark_df.withColumn(
)
# train xgboost classifier model with validation dataset
xgb_classifier2 = SparkXGBClassifier(max_depth=5, validation_indicator_col="validationIndicatorCol")
xgb_classifier2 = SparkXGBClassifier(
max_depth=5, validation_indicator_col="validationIndicatorCol"
)
# NOTE(review): iris_train_spark_df2 and classifier_evaluator are defined
# outside this hunk; the "f1" label in the message presumably matches the
# evaluator's metric -- confirm against the full file.
xgb_classifier_model2 = xgb_classifier2.fit(iris_train_spark_df2)
# apply the fitted model to the iris test DataFrame and print the evaluator's score
transformed_iris_test_spark_df2 = xgb_classifier_model2.transform(iris_test_spark_df)
print(f"classifier2 f1={classifier_evaluator.evaluate(transformed_iris_test_spark_df2)}")
print(
f"classifier2 f1={classifier_evaluator.evaluate(transformed_iris_test_spark_df2)}"
)
# stop the SparkSession obtained at the top of the script
spark.stop()