具有贝叶斯优化的XGBoost和随机森林

让我们深入比较一下 - XGBoost与Random Forest

XGBoost每次构建一棵决策树，每一棵新树都会修正此前已训练的决策树所产生的错误。

随机森林

缺点

$\max_{x \in A} f(x)$

1. 观察初始点
2. 当 $n \le N$ 时循环执行：使用所有可用的数据更新后验概率分布
• 令 $x_n$ 为采集函数（acquisition function）的最大值点
• 观测 $y_n = f(x_n)$
3. 返回一个解：取评估值最大的点

pip install bayesian-optimization

Import libraries

import pandas as pd

import numpy as np

from bayes_opt import BayesianOptimization

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score

Bayesian optimization

def bayesian_optimization(dataset, function, parameters, n_iterations=5):
    """Maximize ``function`` over ``parameters`` with Bayesian optimization.

    Args:
        dataset: Tuple ``(X_train, y_train, X_test, y_test)``. The optimizer
            itself does not read it; the parameter is kept so existing
            callers keep working.
        function: Objective to maximize, handed to ``BayesianOptimization``.
        parameters: Search space handed to ``BayesianOptimization``; the
            callers in this file pass a mapping of parameter name to a
            ``(lower, upper)`` tuple.
        n_iterations: Value passed to ``maximize`` as ``n_iter``. Defaults
            to 5, the value that used to be hard-coded.

    Returns:
        The optimizer's ``max`` attribute. ``train`` reads the best
        parameter values from its ``"params"`` entry.
    """
    gp_params = {"alpha": 1e-4}

    optimizer = BayesianOptimization(function, parameters)
    # Configure the Gaussian process through set_gp_params() instead of
    # forwarding keyword arguments through maximize(): newer bayes_opt
    # releases no longer accept GP options on maximize(), while
    # set_gp_params() is available on old and new releases alike.
    optimizer.set_gp_params(**gp_params)
    optimizer.maximize(n_iter=n_iterations)

    return optimizer.max

def rfc_optimization(cv_splits):
    """Build the random-forest objective and its search space.

    The returned objective reads ``X_train`` and ``y_train`` from the
    enclosing (module) scope at call time, so both must be defined before
    the optimizer starts evaluating it.

    Args:
        cv_splits: Forwarded to ``cross_val_score`` as ``cv``.

    Returns:
        A ``(function, parameters)`` pair: the objective to maximize and a
        dict mapping each hyper-parameter name to its ``(lower, upper)``
        bounds.
    """

    def function(n_estimators, max_depth, min_samples_split):
        """Return the mean cross-validated ROC AUC for one candidate."""
        # Candidates are cast to int and clamped from below. The floor for
        # n_estimators is 1 (it used to be 0): a forest needs at least one
        # tree, so 0 would be rejected by the estimator.
        model = RandomForestClassifier(
            n_estimators=int(max(n_estimators, 1)),
            max_depth=int(max(max_depth, 1)),
            min_samples_split=int(max(min_samples_split, 2)),
            n_jobs=-1,
            random_state=42,
            class_weight="balanced",
        )
        return cross_val_score(
            model,
            X=X_train,
            y=y_train,
            cv=cv_splits,
            scoring="roc_auc",
            n_jobs=-1,
        ).mean()

    parameters = {
        "n_estimators": (10, 1000),
        "max_depth": (1, 150),
        "min_samples_split": (2, 10),
    }

    return function, parameters

def xgb_optimization(cv_splits, eval_set):
    """Build the XGBoost objective and its search space.

    The returned objective reads ``X_train`` and ``y_train`` from the
    enclosing (module) scope at call time.

    NOTE(review): the objective uses the name ``xgb``; no import binding it
    is visible in this part of the file (presumably
    ``import xgboost as xgb``) - confirm it exists at module level.

    Args:
        cv_splits: Forwarded to ``cross_val_score`` as ``cv``.
        eval_set: Forwarded to the estimator's ``fit`` through
            ``fit_params`` under the ``"eval_set"`` key.

    Returns:
        A ``(function, parameters)`` pair: the objective to maximize and a
        dict mapping each hyper-parameter name to its ``(lower, upper)``
        bounds.
    """
    search_space = dict(
        eta=(0.001, 0.4),
        gamma=(0, 20),
        max_depth=(1, 2000),
    )

    def objective(eta, gamma, max_depth):
        """Return the mean cross-validated ROC AUC for one candidate."""
        fit_options = {
            "early_stopping_rounds": 10,
            "eval_metric": "auc",
            "eval_set": eval_set,
        }
        classifier = xgb.XGBClassifier(
            objective="binary:logistic",
            # eta and gamma are clamped to be non-negative.
            learning_rate=max(eta, 0),
            gamma=max(gamma, 0),
            max_depth=int(max_depth),
            seed=42,
            # Ratio of label-0 rows to label-1 rows in y_train.
            scale_pos_weight=(
                len(y_train[y_train == 0]) / len(y_train[y_train == 1])
            ),
        )
        fold_scores = cross_val_score(
            classifier,
            X=X_train,
            y=y_train,
            cv=cv_splits,
            scoring="roc_auc",
            fit_params=fit_options,
            n_jobs=-1,
        )
        return fold_scores.mean()

    return objective, search_space

Train model

def train(X_train, y_train, X_test, y_test, function, parameters):
    """Tune a random forest with Bayesian optimization, then fit it.

    The final model is always a ``RandomForestClassifier``, so
    ``parameters`` must describe the random-forest search space
    (``n_estimators``, ``max_depth``, ``min_samples_split``).

    Args:
        X_train: Training features used to fit the final model.
        y_train: Training labels used to fit the final model.
        X_test: Only bundled into the dataset tuple passed to
            ``bayesian_optimization``.
        y_test: Only bundled into the dataset tuple passed to
            ``bayesian_optimization``.
        function: Objective passed to ``bayesian_optimization``.
        parameters: Search space passed to ``bayesian_optimization``.

    Returns:
        The ``RandomForestClassifier`` fitted on ``X_train``/``y_train``
        with the best parameters found.

    Raises:
        KeyError: If the best solution lacks one of the random-forest
            parameter names.
    """
    dataset = (X_train, y_train, X_test, y_test)

    best_solution = bayesian_optimization(dataset, function, parameters)
    params = best_solution["params"]

    # The best values are cast to int and clamped from below. The floor for
    # n_estimators is 1 (it used to be 0): a forest needs at least one tree.
    model = RandomForestClassifier(
        n_estimators=int(max(params["n_estimators"], 1)),
        max_depth=int(max(params["max_depth"], 1)),
        min_samples_split=int(max(params["min_samples_split"], 2)),
        n_jobs=-1,
        random_state=42,
        class_weight="balanced",
    )
    model.fit(X_train, y_train)

    return model