The data used in this example can be downloaded from the linked page (extraction code: 3y90); a description of the data is available on the accompanying page. The "Model Tuning" part of this post follows the steps from the referenced blog post. For the principles behind XGBoost and an introduction to its parameters, see the article on the GBDT and XGBoost algorithms.
Data Preprocessing
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, LabelBinarizer
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline

### Read the data and split it into training and test sets
data = pd.read_csv('Train.csv', encoding="ISO-8859-1")
train, test = train_test_split(data, train_size=0.7, random_state=123, stratify=data['Disbursed'])
target = 'Disbursed'
predictors_raw = [col for col in train.columns if col != target]
train_X, train_y = train[predictors_raw], train[target]

### Separate numeric and categorical features
category_cols = train_X.columns[train_X.dtypes == object].tolist()
category_cols.append('Var4')  # per the data description, Var4 is numeric in type but categorical in meaning
numeric_cols = list(set(train_X.columns) - set(category_cols))

### Inspect the ratio of missing values for each feature
nulls_per_column = train_X.isnull().sum()
for c in train_X.columns:
    print('Ratio of missing value for variable {0}: {1}'.format(c, nulls_per_column[c] / train_X.shape[0]))
print('-----------------------------------------------------------')
# Replace features with many missing values by missing-value indicators
numeric_add = ['Loan_Amount_Submitted_Missing', 'Loan_Tenure_Submitted_Missing', 'EMI_Loan_Submitted_Missing',
               'Interest_Rate_Missing', 'Processing_Fee_Missing']
numeric_cols = numeric_cols + numeric_add
# Convert date of birth to age
numeric_cols = numeric_cols + ['Age']

### Count the samples in each category of the categorical features
counts = dict()
for v in category_cols:
    counts[v] = train_X[v].value_counts()
# Merge sparsely populated categories of the categorical features
non_merge_city = [c for c in counts['City'].index if counts['City'][c] >= 200]
non_merge_sa = [c for c in counts['Salary_Account'].index if counts['Salary_Account'][c] >= 100]
non_merge_sr = [c for c in counts['Source'].index if counts['Source'][c] >= 100]

### Drop meaningless or low-value features
dropped_columns = ['ID', 'Lead_Creation_Date', 'LoggedIn', 'Employer_Name', 'DOB', 'Loan_Amount_Submitted',
                   'Loan_Tenure_Submitted', 'EMI_Loan_Submitted', 'Interest_Rate', 'Processing_Fee']

### Feature engineering ###
def preprocess(X):
    X['Var4'] = X['Var4'].astype(str)  # treat the numeric column as a categorical feature
    # Merge sparsely populated categories
    X['City'] = X['City'].apply(lambda x: 'others' if x not in non_merge_city and not pd.isnull(x) else x)
    X['Salary_Account'] = X['Salary_Account'].apply(lambda x: 'others' if x not in non_merge_sa and not pd.isnull(x) else x)
    X['Source'] = X['Source'].apply(lambda x: 'others' if x not in non_merge_sr and not pd.isnull(x) else x)
    # Convert date of birth to age
    X['Age'] = X['DOB'].apply(lambda x: 120 - int(x[-2:]))
    # Replace features with many missing values by missing-value indicators
    X['Loan_Amount_Submitted_Missing'] = X['Loan_Amount_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
    X['Loan_Tenure_Submitted_Missing'] = X['Loan_Tenure_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
    X['EMI_Loan_Submitted_Missing'] = X['EMI_Loan_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
    X['Interest_Rate_Missing'] = X['Interest_Rate'].apply(lambda x: 1 if pd.isnull(x) else 0)
    X['Processing_Fee_Missing'] = X['Processing_Fee'].apply(lambda x: 1 if pd.isnull(x) else 0)
    return X.drop(dropped_columns, axis=1)  # drop meaningless or low-value features

### Impute missing values of the numeric features
numeric_imputer = [([feature], SimpleImputer(strategy="median"))
                   for feature in numeric_cols if feature not in dropped_columns]
### Impute missing values of the categorical features and one-hot encode them
category_imputer = [([feature], [SimpleImputer(strategy='constant', fill_value='Missing'), LabelBinarizer()])
                    for feature in category_cols if feature not in dropped_columns]
### Combine the numeric and categorical features
numeric_categorical_union = DataFrameMapper(numeric_imputer + category_imputer, input_df=True, df_out=True)
### Build the data pipeline
pipeline_data = Pipeline([("preprocessor", FunctionTransformer(preprocess, validate=False)),
                          ("featureunion", numeric_categorical_union)])
train_3 = pipeline_data.fit_transform(train_X)
train_3[target] = train_y  # train_3 is the dataset used for model tuning below
```
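With the pipeline assembled, it is worth a quick sanity check that imputation and one-hot encoding behaved as expected. The snippet below is a suggested verification, not part of the original workflow:

```python
# Suggested check: the engineered table should contain no missing values,
# and the stratified split should preserve the class balance.
print(train_3.shape)
print(train_3.isnull().sum().sum())                   # expected to be 0 after imputation
print(train_3[target].value_counts(normalize=True))   # class ratio of the training split
```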
Model Tuning
1. Build a base model and use early stopping to tune the number of iterations
```python
import xgboost as xgb

### base model
target = 'Disbursed'
predictors = [x for x in train_3.columns if x != target]
xgb1 = xgb.XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0,
                         subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, seed=27)

### use early_stop in xgb.cv
def get_n_estimators(alg, dtrain, predictors, target, cv_folds=5, early_stopping_rounds=50):
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(dtrain[predictors], label=dtrain[target])
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                      metrics='auc', early_stopping_rounds=early_stopping_rounds, stratified=True)
    # Print model report:
    print("\nModel Report")
    print("Set n_estimators to {0}".format(cvresult.shape[0]))
    print(cvresult.tail(1)['test-auc-mean'])
    return cvresult.shape[0]

### get n_estimators
n_estimators = get_n_estimators(xgb1, train_3, predictors, target)
```
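This works because `xgb.cv` stops adding trees once the mean test AUC has failed to improve for `early_stopping_rounds` consecutive rounds, and the returned DataFrame keeps one row per retained round, so its length is the tuned number of iterations. To see where early stopping kicked in, the learning curve can be plotted directly (a sketch reusing the objects defined above, not part of the original code):

```python
# Sketch: plot mean train/test AUC per boosting round from the same CV run.
xgtrain = xgb.DMatrix(train_3[predictors], label=train_3[target])
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain, num_boost_round=1000, nfold=5,
                  metrics='auc', early_stopping_rounds=50, stratified=True)
cvresult[['train-auc-mean', 'test-auc-mean']].plot()
plt.xlabel('Boosting round')
plt.ylabel('AUC')
plt.show()
```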
2. Tune max_depth and min_child_weight
```python
from sklearn.model_selection import GridSearchCV

param_test1 = {'max_depth': range(3, 10, 2), 'min_child_weight': range(1, 6, 2)}
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=5, min_child_weight=1, gamma=0,
                        subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, seed=27)
gsearch1 = GridSearchCV(estimator=alg, param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch1.fit(train_3[predictors], train_3[target])
print(gsearch1.best_params_)  # optimal: {'max_depth': 5, 'min_child_weight': 5}
print(gsearch1.best_score_)

param_test2 = {'max_depth': [4, 5, 6], 'min_child_weight': [4, 5, 6]}
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=5, min_child_weight=5, gamma=0,
                        subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, seed=27)
gsearch2 = GridSearchCV(estimator=alg, param_grid=param_test2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch2.fit(train_3[predictors], train_3[target])
print(gsearch2.best_params_)  # optimal: {'max_depth': 4, 'min_child_weight': 6}
print(gsearch2.best_score_)

param_test2b = {'min_child_weight': [6, 8, 10, 12]}
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=4, min_child_weight=6, gamma=0,
                        subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, seed=27)
gsearch2b = GridSearchCV(estimator=alg, param_grid=param_test2b, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch2b.fit(train_3[predictors], train_3[target])
print(gsearch2b.best_params_)  # optimal: {'min_child_weight': 6}
print(gsearch2b.best_score_)
```
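The search above moves coarse to fine: a wide grid with step 2 first, then a 3x3 grid around the winner, and finally a one-dimensional sweep of `min_child_weight` because its optimum landed on the grid edge. To judge how sensitive the AUC actually is across the grid, the full cross-validation table can be inspected (a sketch based on scikit-learn's standard `cv_results_` attribute, not in the original):

```python
# Sketch: rank every (max_depth, min_child_weight) combination by mean CV AUC.
results = pd.DataFrame(gsearch2.cv_results_)
cols = ['param_max_depth', 'param_min_child_weight', 'mean_test_score', 'std_test_score']
print(results[cols].sort_values('mean_test_score', ascending=False))
```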
3. Tune gamma, then re-tune the number of iterations
```python
param_test3 = {'gamma': [i / 10.0 for i in range(0, 5)]}
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=4, min_child_weight=6, gamma=0,
                        subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, seed=27)
gsearch3 = GridSearchCV(estimator=alg, param_grid=param_test3, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch3.fit(train_3[predictors], train_3[target])
print(gsearch3.best_params_)  # optimal: {'gamma': 0.2}
print(gsearch3.best_score_)

xgb2 = xgb.XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=4, min_child_weight=6, gamma=0.2,
                         subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, seed=27)
n_estimators = get_n_estimators(xgb2, train_3, predictors, target)
```
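`gamma` raises the minimum loss reduction required to make a split, so increasing it prunes the trees; that is also why `get_n_estimators` is rerun afterwards, since the optimal tree count shifts with the regularization. A one-dimensional sweep like this is easy to visualize (a sketch, not in the original):

```python
# Sketch: plot mean CV AUC as a function of gamma.
res3 = pd.DataFrame(gsearch3.cv_results_)
plt.plot(res3['param_gamma'].astype(float), res3['mean_test_score'], marker='o')
plt.xlabel('gamma')
plt.ylabel('Mean CV AUC')
plt.show()
```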
4. Tune subsample and colsample_bytree
```python
param_test4 = {'subsample': [i / 10.0 for i in range(6, 11)], 'colsample_bytree': [i / 10.0 for i in range(6, 11)]}
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=4, min_child_weight=6, gamma=0.2,
                        subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, seed=27)
gsearch4 = GridSearchCV(estimator=alg, param_grid=param_test4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch4.fit(train_3[predictors], train_3[target])
print(gsearch4.best_params_)  # optimal: {'colsample_bytree': 0.7, 'subsample': 0.7}
print(gsearch4.best_score_)

param_test5 = {'subsample': [i / 100.0 for i in range(65, 80, 5)], 'colsample_bytree': [i / 100.0 for i in range(65, 80, 5)]}
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=4, min_child_weight=6, gamma=0.2,
                        subsample=0.7, colsample_bytree=0.7, objective='binary:logistic', nthread=4, seed=27)
gsearch5 = GridSearchCV(estimator=alg, param_grid=param_test5, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch5.fit(train_3[predictors], train_3[target])
print(gsearch5.best_params_)  # optimal: {'colsample_bytree': 0.75, 'subsample': 0.7}
print(gsearch5.best_score_)
```
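For a two-parameter grid, a pivot table makes the interaction between the two sampling ratios easier to read than the raw results (a sketch, not in the original):

```python
# Sketch: mean CV AUC as a subsample x colsample_bytree table.
res4 = pd.DataFrame(gsearch4.cv_results_)
res4['param_subsample'] = res4['param_subsample'].astype(float)
res4['param_colsample_bytree'] = res4['param_colsample_bytree'].astype(float)
print(res4.pivot_table(index='param_subsample', columns='param_colsample_bytree',
                       values='mean_test_score'))
```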
5. Tune reg_alpha
```python
param_test6 = {'reg_alpha': [0, 1e-5, 1e-2, 0.1, 1, 100]}
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=4, min_child_weight=6, gamma=0.2,
                        subsample=0.7, colsample_bytree=0.75, objective='binary:logistic', nthread=4, seed=27)
gsearch6 = GridSearchCV(estimator=alg, param_grid=param_test6, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch6.fit(train_3[predictors], train_3[target])
print(gsearch6.best_params_)  # optimal: {'reg_alpha': 0.01}
print(gsearch6.best_score_)

param_test7 = {'reg_alpha': [0.001, 0.005, 0.01, 0.05]}
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=4, min_child_weight=6, gamma=0.2,
                        reg_alpha=0.01, subsample=0.7, colsample_bytree=0.75,
                        objective='binary:logistic', nthread=4, seed=27)
gsearch7 = GridSearchCV(estimator=alg, param_grid=param_test7, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch7.fit(train_3[predictors], train_3[target])
print(gsearch7.best_params_)  # optimal: {'reg_alpha': 0.01}
print(gsearch7.best_score_)
```
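Because `reg_alpha` can reasonably live anywhere across several orders of magnitude, the first sweep is logarithmic and the second zooms into a linear neighborhood of the winner. The same pattern can be generated programmatically (an illustrative sketch; the helper values below are hypothetical, not from the original):

```python
# Sketch: log-spaced coarse grid, then a linear fine grid around the best coarse value.
coarse = [0] + list(np.logspace(-5, 2, 8))        # 0, 1e-5, ..., 100
best = 0.01                                       # winner from the coarse sweep
fine = [best * f for f in (0.1, 0.5, 1.0, 5.0)]   # zoom in around it
print(coarse)
print(fine)
```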
6. Tune reg_lambda, then re-tune the number of iterations
```python
param_test8 = {'reg_lambda': [0, 0.01, 0.1, 1, 10, 100]}
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=4, min_child_weight=6, gamma=0.2,
                        reg_alpha=0.01, subsample=0.7, colsample_bytree=0.75,
                        objective='binary:logistic', nthread=4, seed=27)
gsearch8 = GridSearchCV(estimator=alg, param_grid=param_test8, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch8.fit(train_3[predictors], train_3[target])
print(gsearch8.best_params_)  # optimal: {'reg_lambda': 1}
print(gsearch8.best_score_)

param_test9 = {'reg_lambda': [0.5, 0.7, 1, 3, 5]}
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=4, min_child_weight=6, gamma=0.2,
                        reg_alpha=0.01, subsample=0.7, colsample_bytree=0.75,
                        objective='binary:logistic', nthread=4, seed=27)
gsearch9 = GridSearchCV(estimator=alg, param_grid=param_test9, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch9.fit(train_3[predictors], train_3[target])
print(gsearch9.best_params_)  # optimal: {'reg_lambda': 1}
print(gsearch9.best_score_)

xgb3 = xgb.XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=4, min_child_weight=6, gamma=0.2,
                         reg_alpha=0.01, reg_lambda=1, subsample=0.7, colsample_bytree=0.75,
                         objective='binary:logistic', nthread=4, seed=27)
n_estimators = get_n_estimators(xgb3, train_3, predictors, target)
```
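At this point all tree-structure, sampling, and regularization parameters are fixed, and only the learning rate remains. For the record, the full parameter set carried by `xgb3` can be dumped before moving on (a sketch, not in the original):

```python
# Sketch: snapshot of every parameter tuned so far, as XGBoost will receive them.
print(xgb3.get_xgb_params())
```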
7. Lower the learning rate and re-tune the number of iterations, using the output of get_n_estimators to check whether the model improves
```python
xgb4 = xgb.XGBClassifier(learning_rate=0.01, n_estimators=5000, max_depth=4, min_child_weight=6, gamma=0.2,
                         reg_alpha=0.01, reg_lambda=1, subsample=0.7, colsample_bytree=0.75,
                         objective='binary:logistic', nthread=4, seed=27)
n_estimators_lr = get_n_estimators(xgb4, train_3, predictors, target)
```
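Lowering the learning rate to 0.01 means each tree contributes less, so many more trees are needed (hence the ceiling of `n_estimators=5000`), and the change is worth keeping only if the cross-validated AUC printed by `get_n_estimators` improves. The comparison can also be made programmatically (a sketch, not in the original):

```python
# Sketch: compare the final cross-validated AUC of the two learning rates.
def cv_auc(alg):
    xgtrain = xgb.DMatrix(train_3[predictors], label=train_3[target])
    cvresult = xgb.cv(alg.get_xgb_params(), xgtrain,
                      num_boost_round=alg.get_params()['n_estimators'],
                      nfold=5, metrics='auc', early_stopping_rounds=50, stratified=True)
    return cvresult['test-auc-mean'].iloc[-1]

print('learning_rate=0.10:', cv_auc(xgb3))
print('learning_rate=0.01:', cv_auc(xgb4))
```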
Training and Validating the Model
```python
### The tuned classifier
tuned_xgb = xgb.XGBClassifier(learning_rate=0.01, n_estimators=n_estimators_lr, max_depth=4, min_child_weight=6,
                              gamma=0.2, reg_alpha=0.01, reg_lambda=1, subsample=0.7, colsample_bytree=0.75,
                              objective='binary:logistic', nthread=4, seed=27)

### Train the model
pipeline = Pipeline([("feature_engineer", pipeline_data), ("classifier", tuned_xgb)])
pipeline.fit(train_X, train_y)

### Feature importance
feat_imp = pd.Series(pipeline.named_steps['classifier'].get_booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances', figsize=(20, 6))  # as shown in the figure below
plt.ylabel('Feature Importance Score')

### Predict
print(pipeline.predict_proba(test.iloc[[1]][predictors_raw]))   # individual prediction
predprob = pipeline.predict_proba(test[predictors_raw])[:, 1]    # test data predictions
print("AUC Score (Test): %f" % metrics.roc_auc_score(test[target], predprob))  # AUC Score (Test): 0.8572
```
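Beyond the single AUC number, the ROC curve behind it can be plotted from the same test-set predictions (a sketch, not in the original):

```python
# Sketch: ROC curve for the test-set predictions.
fpr, tpr, _ = metrics.roc_curve(test[target], predprob)
plt.plot(fpr, tpr, label='XGBoost (AUC = %.4f)' % metrics.roc_auc_score(test[target], predprob))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
```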